From cee75bff6a52bbb4c599dfb951e21c2b2c0449a1 Mon Sep 17 00:00:00 2001
From: Benoit Favre <benoit.favre@lis-lab.fr>
Date: Wed, 10 Apr 2024 16:14:44 +0200
Subject: [PATCH] update checkpoints

---
 README.md | 64 +++++++++++++++++++++++++++----------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index c8bdca3..7bfcb67 100644
--- a/README.md
+++ b/README.md
@@ -56,54 +56,54 @@ Models
 
 All models are trained from the 1st 100M tokens from [Common Crawl](http://data.statmt.org/cc-100/)
 
-[checkpoints/it.22000](https://github.com/CoffeePerry/recasepunc/releases/download/v0.1.0/it.22000)
+[checkpoints/it.23000](https://github.com/benob/recasepunc/releases/download/0.4/it.23000)
 ```
 {
-  "iteration": "22000",
-  "train_loss": "0.058934884114190934",
-  "valid_loss": "0.06988634882792658",
-  "valid_accuracy_case": "0.9575860089785607",
-  "valid_accuracy_punc": "0.940614491584733",
-  "valid_fscore": "{0: 0.6431694030761719, 1: 0.6150795817375183, 2: 0.7023577094078064, 3: 0.5514711737632751, 4: 0.21250930428504944}",
-  "config": "{'seed': 871253, 'lang': 'it', 'flavor': 'dbmdz/bert-base-italian-uncased', 'max_length': 256, 'batch_size': 4, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/it-100M.train.x', 'data/it-100M.train.y', 'data/it-100M.valid.x', 'data/it-100M.valid.y', 'checkpoints/it'], 'pad_token_id': 0, 'cls_token_id': 102, 'cls_token': '[CLS]', 'sep_token_id': 103, 'sep_token': '[SEP]'}"
+  "iteration": "23000",
+  "train_loss": "0.015077149430289864",
+  "valid_loss": "0.021484553813934326",
+  "valid_accuracy_case": "0.9517227564102564",
+  "valid_accuracy_punc": "0.9359975961538461",
+  "valid_fscore": "{0: 0.6016615629196167, 1: 0.6202345490455627, 2: 0.6219512224197388, 3: 0.42424243688583374, 4: 0.08571428805589676}",
+  "config": "{'seed': 871253, 'lang': 'it', 'flavor': 'dbmdz/bert-base-italian-uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/it-100M.train.x', 'data/it-100M.train.y', 'data/it-100M.valid.x', 'data/it-100M.valid.y', 'checkpoints/it'], 'pad_token_id': 0, 'cls_token_id': 102, 'cls_token': '[CLS]', 'sep_token_id': 103, 'sep_token': '[SEP]'}"
 }
 ```
 
-[checkpoints/zh.24000](https://github.com/benob/recasepunc/releases/download/0.3/zh.24000)
+[checkpoints/zh-Hant.17000](https://github.com/benob/recasepunc/releases/download/0.4/zh-Hant.17000)
 ```
 {
-  "iteration": "24000",
-  "train_loss": "0.006788245493080467",
-  "valid_loss": "0.007345725328494341",
-  "valid_accuracy_case": "0.9963942307692307",
-  "valid_accuracy_punc": "0.9692508012820513",
-  "valid_fscore": "{0: 0.7727023363113403, 1: 0.7901785373687744, 2: 0.7293065190315247, 3: 0.7692307829856873, 4: 0.4615384638309479}",
-  "config": "{'seed': 871253, 'lang': 'zh', 'flavor': 'ckiplab/bert-base-chinese', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/zh-100M.train.x', 'data/zh-100M.train.y', 'data/zh-100M.valid.x', 'data/zh-100M.valid.y', 'checkpoints/zh'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}"
+  "iteration": "17000",
+  "train_loss": "0.007012549160048366",
+  "valid_loss": "0.007463883130978315",
+  "valid_accuracy_case": "0.9967948717948718",
+  "valid_accuracy_punc": "0.9682491987179487",
+  "valid_fscore": "{0: 0.7668336033821106, 1: 0.7813194990158081, 2: 0.7200000286102295, 3: 0.8333333730697632, 4: 0.7272727489471436}",
+  "config": "{'seed': 871253, 'lang': 'zh-Hant', 'flavor': 'ckiplab/bert-base-chinese', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/zh-Hant-100M.train.x', 'data/zh-Hant-100M.train.y', 'data/zh-Hant-100M.valid.x', 'data/zh-Hant-100M.valid.y', 'checkpoints/zh-Hant'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}"
 }
 ```
 
-[checkpoints/en.23000](https://github.com/benob/recasepunc/releases/download/0.3/en.23000)
+[checkpoints/en.22000](https://github.com/benob/recasepunc/releases/download/0.4/en.22000)
 ```
 {
-  "iteration": "23000",
-  "train_loss": "0.014598741472698748",
-  "valid_loss": "0.025432642453756087",
-  "valid_accuracy_case": "0.9407051282051282",
-  "valid_accuracy_punc": "0.9401041666666666",
-  "valid_fscore": "{0: 0.6455026268959045, 1: 0.5925925970077515, 2: 0.7243649959564209, 3: 0.7027027010917664, 4: 0.03921568766236305}",                                                    
-  "config": "{'seed': 871253, 'lang': 'en', 'flavor': 'bert-base-uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/en-100M.train.x', 'data/en-100M.train.y', 'data/en-100M.valid.x', 'data/en-100M.valid.y', 'checkpoints/en'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}"                                                                                           
+  "iteration": "22000",
+  "train_loss": "0.01467611983884126",
+  "valid_loss": "0.02559371789296468",
+  "valid_accuracy_case": "0.9393028846153846",
+  "valid_accuracy_punc": "0.9404046474358975",
+  "valid_fscore": "{0: 0.6431096196174622, 1: 0.603951096534729, 2: 0.7078340649604797, 3: 0.6865671277046204, 4: 0}",
+  "config": "{'seed': 871253, 'lang': 'en', 'flavor': 'bert-base-uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/en-100M.train.x', 'data/en-100M.train.y', 'data/en-100M.valid.x', 'data/en-100M.valid.y', 'checkpoints/en'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}"
 }
 ```
 
-[checkpoints/fr.22000](https://github.com/benob/recasepunc/releases/download/0.3/fr.22000)
+[checkpoints/fr.24000](https://github.com/benob/recasepunc/releases/download/0.4/fr.24000)
 ```
 {
-  "iteration": "22000",
-  "train_loss": "0.02052250287961215",
-  "valid_loss": "0.009240646392871171",
-  "valid_accuracy_case": "0.9881810897435898",
-  "valid_accuracy_punc": "0.9683493589743589",
-  "valid_fscore": "{0: 0.802524745464325, 1: 0.7892595529556274, 2: 0.8360477685928345, 3: 0.8717948198318481, 4: 0.2068965584039688}",
+  "iteration": "24000",
+  "train_loss": "0.015482447233051061",
+  "valid_loss": "0.006200919071069131",
+  "valid_accuracy_case": "1.0",
+  "valid_accuracy_punc": "0.9691506410256411",
+  "valid_fscore": "{0: 0.8114132881164551, 1: 0.7968379855155945, 2: 0.8446389436721802, 3: 0.8421052694320679, 4: 0.3076923191547394}",
   "config": "{'seed': 871253, 'lang': 'fr', 'flavor': 'flaubert/flaubert_base_uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/fr-100M.train.x', 'data/fr-100M.train.y', 'data/fr-100M.valid.x', 'data/fr-100M.valid.y', 'checkpoints/fr'], 'pad_token_id': 2, 'cls_token_id': 0, 'cls_token': '<s>', 'sep_token_id': 1, 'sep_token': '</s>'}"
 }
 ```
@@ -139,9 +139,9 @@ python recasepunc.py eval test.x test.y checkpoint/path.iteration
 Two scripts used to create the models are given as example of how to train for a new language:
 * `./prepare.sh <lang>` for downloading data, creating sets, and preprocessing
 * `./train.sh <lang>` for trainging the model
+
 Both assume the availability of a `env.sh` script for loading the environment and setting up stuff.
 `requirements.freeze.txt` contains the package versions used for training.
-
 You will need to modify recasepunc.py and set the BERT flavior for the new language and check that the tokenizer correctly handles punctuation. For French, we had to patch the tokenizer to keep input/punctuation synchronized.
 
 Notes