From cee75bff6a52bbb4c599dfb951e21c2b2c0449a1 Mon Sep 17 00:00:00 2001 From: Benoit Favre Date: Wed, 10 Apr 2024 16:14:44 +0200 Subject: [PATCH] update checkpoints --- README.md | 64 +++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index c8bdca3..7bfcb67 100644 --- a/README.md +++ b/README.md @@ -56,54 +56,54 @@ Models All models are trained from the 1st 100M tokens from [Common Crawl](http://data.statmt.org/cc-100/) -[checkpoints/it.22000](https://github.com/CoffeePerry/recasepunc/releases/download/v0.1.0/it.22000) +[checkpoints/it.23000](https://github.com/benob/recasepunc/releases/download/v0.4/it.23000) ``` { - "iteration": "22000", - "train_loss": "0.058934884114190934", - "valid_loss": "0.06988634882792658", - "valid_accuracy_case": "0.9575860089785607", - "valid_accuracy_punc": "0.940614491584733", - "valid_fscore": "{0: 0.6431694030761719, 1: 0.6150795817375183, 2: 0.7023577094078064, 3: 0.5514711737632751, 4: 0.21250930428504944}", - "config": "{'seed': 871253, 'lang': 'it', 'flavor': 'dbmdz/bert-base-italian-uncased', 'max_length': 256, 'batch_size': 4, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/it-100M.train.x', 'data/it-100M.train.y', 'data/it-100M.valid.x', 'data/it-100M.valid.y', 'checkpoints/it'], 'pad_token_id': 0, 'cls_token_id': 102, 'cls_token': '[CLS]', 'sep_token_id': 103, 'sep_token': '[SEP]'}" + "iteration": "23000", + "train_loss": "0.015077149430289864", + "valid_loss": "0.021484553813934326", + "valid_accuracy_case": "0.9517227564102564", + "valid_accuracy_punc": "0.9359975961538461", + "valid_fscore": "{0: 0.6016615629196167, 1: 0.6202345490455627, 2: 0.6219512224197388, 3: 0.42424243688583374, 4: 0.08571428805589676}", + "config": "{'seed': 871253, 'lang': 'it', 'flavor': 'dbmdz/bert-base-italian-uncased', 'max_length': 256, 'batch_size': 
16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/it-100M.train.x', 'data/it-100M.train.y', 'data/it-100M.valid.x', 'data/it-100M.valid.y', 'checkpoints/it'], 'pad_token_id': 0, 'cls_token_id': 102, 'cls_token': '[CLS]', 'sep_token_id': 103, 'sep_token': '[SEP]'}" } ``` -[checkpoints/zh.24000](https://github.com/benob/recasepunc/releases/download/0.3/zh.24000) +[checkpoints/zh-Hant.17000](https://github.com/benob/recasepunc/releases/download/0.4/zh-Hant.17000) ``` { - "iteration": "24000", - "train_loss": "0.006788245493080467", - "valid_loss": "0.007345725328494341", - "valid_accuracy_case": "0.9963942307692307", - "valid_accuracy_punc": "0.9692508012820513", - "valid_fscore": "{0: 0.7727023363113403, 1: 0.7901785373687744, 2: 0.7293065190315247, 3: 0.7692307829856873, 4: 0.4615384638309479}", - "config": "{'seed': 871253, 'lang': 'zh', 'flavor': 'ckiplab/bert-base-chinese', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/zh-100M.train.x', 'data/zh-100M.train.y', 'data/zh-100M.valid.x', 'data/zh-100M.valid.y', 'checkpoints/zh'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}" + "iteration": "17000", + "train_loss": "0.007012549160048366", + "valid_loss": "0.007463883130978315", + "valid_accuracy_case": "0.9967948717948718", + "valid_accuracy_punc": "0.9682491987179487", + "valid_fscore": "{0: 0.7668336033821106, 1: 0.7813194990158081, 2: 0.7200000286102295, 3: 0.8333333730697632, 4: 0.7272727489471436}", + "config": "{'seed': 871253, 'lang': 'zh-Hant', 'flavor': 'ckiplab/bert-base-chinese', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 
'action_args': ['data/zh-Hant-100M.train.x', 'data/zh-Hant-100M.train.y', 'data/zh-Hant-100M.valid.x', 'data/zh-Hant-100M.valid.y', 'checkpoints/zh-Hant'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}" } ``` -[checkpoints/en.23000](https://github.com/benob/recasepunc/releases/download/0.3/en.23000) +[checkpoints/en.22000](https://github.com/benob/recasepunc/releases/download/0.4/en.22000) ``` { - "iteration": "23000", - "train_loss": "0.014598741472698748", - "valid_loss": "0.025432642453756087", - "valid_accuracy_case": "0.9407051282051282", - "valid_accuracy_punc": "0.9401041666666666", - "valid_fscore": "{0: 0.6455026268959045, 1: 0.5925925970077515, 2: 0.7243649959564209, 3: 0.7027027010917664, 4: 0.03921568766236305}", - "config": "{'seed': 871253, 'lang': 'en', 'flavor': 'bert-base-uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/en-100M.train.x', 'data/en-100M.train.y', 'data/en-100M.valid.x', 'data/en-100M.valid.y', 'checkpoints/en'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}" + "iteration": "22000", + "train_loss": "0.01467611983884126", + "valid_loss": "0.02559371789296468", + "valid_accuracy_case": "0.9393028846153846", + "valid_accuracy_punc": "0.9404046474358975", + "valid_fscore": "{0: 0.6431096196174622, 1: 0.603951096534729, 2: 0.7078340649604797, 3: 0.6865671277046204, 4: 0}", + "config": "{'seed': 871253, 'lang': 'en', 'flavor': 'bert-base-uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/en-100M.train.x', 'data/en-100M.train.y', 'data/en-100M.valid.x', 'data/en-100M.valid.y', 'checkpoints/en'], 'pad_token_id': 0, 'cls_token_id': 
101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}" } ``` -[checkpoints/fr.22000](https://github.com/benob/recasepunc/releases/download/0.3/fr.22000) +[checkpoints/fr.24000](https://github.com/benob/recasepunc/releases/download/0.4/fr.24000) ``` { - "iteration": "22000", - "train_loss": "0.02052250287961215", - "valid_loss": "0.009240646392871171", - "valid_accuracy_case": "0.9881810897435898", - "valid_accuracy_punc": "0.9683493589743589", - "valid_fscore": "{0: 0.802524745464325, 1: 0.7892595529556274, 2: 0.8360477685928345, 3: 0.8717948198318481, 4: 0.2068965584039688}", + "iteration": "24000", + "train_loss": "0.015482447233051061", + "valid_loss": "0.006200919071069131", + "valid_accuracy_case": "1.0", + "valid_accuracy_punc": "0.9691506410256411", + "valid_fscore": "{0: 0.8114132881164551, 1: 0.7968379855155945, 2: 0.8446389436721802, 3: 0.8421052694320679, 4: 0.3076923191547394}", "config": "{'seed': 871253, 'lang': 'fr', 'flavor': 'flaubert/flaubert_base_uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/fr-100M.train.x', 'data/fr-100M.train.y', 'data/fr-100M.valid.x', 'data/fr-100M.valid.y', 'checkpoints/fr'], 'pad_token_id': 2, 'cls_token_id': 0, 'cls_token': '', 'sep_token_id': 1, 'sep_token': ''}" } ``` @@ -139,9 +139,9 @@ python recasepunc.py eval test.x test.y checkpoint/path.iteration Two scripts used to create the models are given as example of how to train for a new language: * `./prepare.sh ` for downloading data, creating sets, and preprocessing * `./train.sh ` for training the model + Both assume the availability of a `env.sh` script for loading the environment and setting up stuff. `requirements.freeze.txt` contains the package versions used for training. 
- You will need to modify recasepunc.py and set the BERT flavor for the new language and check that the tokenizer correctly handles punctuation. For French, we had to patch the tokenizer to keep input/punctuation synchronized. Notes