-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathft_data.sh
50 lines (38 loc) · 1.52 KB
/
ft_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/bin/bash
# Prepare the fine-tuning data for a given language specification.
#
# Usage: ft_data.sh <languages>
#   <languages>  forwarded unchanged as the first argument of
#                ../scripts/build_fine_tune_data.sh (e.g. the en-fr sub-task).
#
# This script defines the configuration variables below, creates the output
# directory, makes the external tools executable and then sources
# build_fine_tune_data.sh in the current shell. Variables that are not used
# here (CODE_VOCAB_PATH, PARA, MONO, test_size, ...) are presumably consumed
# by that sourced script -- verify against build_fine_tune_data.sh.
#
# NOTE: TOOLS_PATH and ../scripts are relative paths, so they are resolved
# against the directory the script is launched from.
set -e

# Fail early when the languages argument is missing: without it the unquoted
# expansion used to vanish and n_samples silently became the first argument
# of the build script.
if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  echo "Usage: ft_data.sh <languages>" >&2
  exit 1
fi
languages=$1

# File containing the codes and vocabularies from the previous meta-processing.
# Left empty on purpose in this template; set it before running.
CODE_VOCAB_PATH=

# Path where the processed files will be stored.
OUTPATH=/content/processed

# Set to True if parallel data is available and needs to be preprocessed.
PARA=True

# Set to True to process monolingual data (if the monolingual data is
# unavailable and this stays True, the parallel data will be used to build
# the monolingual data).
MONO=True

# Folder containing the parallel data.
PARA_PATH=/content/data/para

# Folder containing the monolingual data.
# NOTE(review): identical to PARA_PATH -- looks intentional given the MONO
# fallback described above, but confirm.
MONO_PATH=/content/data/para

# Percentage of data to use as test data (%).
test_size=10

# Percentage of data to use as validation data (%).
val_size=10

# Tool paths.
TOOLS_PATH=tools
TOKENIZE=$TOOLS_PATH/tokenizer_our.sh
LOWER_REMOVE_ACCENT=$TOOLS_PATH/lowercase_and_remove_accent.py
FASTBPE=$TOOLS_PATH/fastBPE/fast

# Script that performs the actual processing; it is sourced below.
BUILD_SCRIPT=../scripts/build_fine_tune_data.sh

# Create the output path.
mkdir -p "$OUTPATH"

# Avoid permission errors when the tools are executed.
chmod +x "$FASTBPE"
# The glob is intentionally left outside the quotes so it still expands.
chmod +x "$TOOLS_PATH"/mosesdecoder/scripts/tokenizer/*.perl

# The n_samples parameter is optional; when it is not passed or when it
# exceeds the dataset size, the whole dataset is considered.
n_samples=-1

# Transform (tokenize, lowercase and remove accents, load codes and vocab,
# apply BPE tokenization, binarize...) the data contained in the text files
# into a pth file understandable by the framework.
# Let's consider the sub-task en-fr.
# chmod the very file that gets sourced (the original targeted a copy in the
# current directory, which aborted the run under `set -e` when absent).
chmod +x "$BUILD_SCRIPT"
. "$BUILD_SCRIPT" "$languages" "$n_samples"