-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfind_dialect_features.py
109 lines (93 loc) · 4.28 KB
/
find_dialect_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from collections import Counter
# Input corpus and output report locations.
IN_FILE_BOKMAAL_PHONO = 'data/bokmaal+phon_cleaned.tsv'
# Alternative input (disabled):
# IN_FILE_BOKMAAL = 'data/bokmaal_cleaned.tsv'
OUT_FILE = 'data/dialect_features.txt'

# One accumulator list per tracked Bokmål word. Each list collects the
# phonetic realizations seen for that word in the corpus. Every name gets
# its own, distinct list object.

# Function words.
ikke, noe, noen, mye = ([] for _ in range(4))
# Personal pronouns.
jeg, hun, vi, dere, de = ([] for _ in range(5))
# Question words.
hva, hvem, hvorfor, når, hvordan, åssen = ([] for _ in range(6))
# Auxiliary / modal verbs ('vaere' is the bucket for the key 'være').
vaere, kunne, skulle, ville, måtte = ([] for _ in range(5))
# Other verbs.
gjøre, komme, bruke, skrive, legge = ([] for _ in range(5))
# Nouns: these buckets are filled through noun2list (definite forms).
tid, side, bygd, klokke, klasse = ([] for _ in range(5))
uke, hytte, helg, elv, søster = ([] for _ in range(5))
# more ending in -e:
kirke, påske, stue, kone, gate, grense = ([] for _ in range(6))
# Separate buckets for the bare forms ('side', 'klokke', ...), which are
# looked up through word2list rather than noun2list.
side_sg, klokke_sg, klasse_sg, uke_sg, hytte_sg = ([] for _ in range(5))

# Tallies for the 'rs' and 'sl' letter sequences.
rs, sl = [], []

# Bokmål word -> accumulator list. Insertion order is the order in which
# the report lists the words.
word2list = {
    # function words
    'ikke': ikke, 'noe': noe, 'noen': noen, 'mye': mye,
    # pronouns
    'jeg': jeg, 'hun': hun, 'vi': vi, 'dere': dere, 'de': de,
    # question words
    'hva': hva, 'hvem': hvem, 'hvorfor': hvorfor, 'når': når,
    'hvordan': hvordan, 'åssen': åssen,
    # auxiliary / modal verbs
    'være': vaere, 'kunne': kunne, 'skulle': skulle,
    'ville': ville, 'måtte': måtte,
    # other verbs
    'gjøre': gjøre, 'komme': komme, 'bruke': bruke,
    'skrive': skrive, 'legge': legge,
    # bare noun forms
    'side': side_sg, 'klokke': klokke_sg, 'klasse': klasse_sg,
    'uke': uke_sg, 'hytte': hytte_sg,
}

# (-a form, -en form, shared bucket): both definite forms of a noun feed
# the same accumulator list.
_NOUN_FORMS = (
    ('tida', 'tiden', tid),
    ('sida', 'siden', side),
    ('bygda', 'bygden', bygd),
    ('klokka', 'klokken', klokke),
    ('klassa', 'klassen', klasse),
    ('uka', 'uken', uke),
    ('hytta', 'hytten', hytte),
    ('helga', 'helgen', helg),
    ('elva', 'elven', elv),
    ('søstera', 'søsteren', søster),
    ('kirka', 'kirken', kirke),
    ('påska', 'påsken', påske),
    ('stua', 'stuen', stue),
    ('kona', 'konen', kone),
    ('gata', 'gaten', gate),
    ('grensa', 'grensen', grense),
)

# The -a forms double as the report headings for the noun section.
nouns = [a_form for a_form, _en_form, _bucket in _NOUN_FORMS]

# Definite noun form -> accumulator list: all -a forms first, then all
# -en forms.
noun2list = {a_form: bucket for a_form, _en_form, bucket in _NOUN_FORMS}
noun2list.update(
    {en_form: bucket for _a_form, en_form, bucket in _NOUN_FORMS})

# this only finds infinitives if they are preceded by 'å'
infinitives = []
# Infinitive (Bokmål form) -> list of its phonetic realizations.
infinitive_realizations = {}
# Read the corpus and tally phonetic realizations.
#
# Each non-blank line is tab-separated; the field at index 4 holds the
# utterance as space-separated 'bokmaal/phon' tokens. For every token:
#   * the phonetic form is appended to the matching bucket in word2list
#     and/or noun2list (lookup is on the lower-cased Bokmål form);
#   * words spelled with 'rs' / 'sl' are tallied by which variant shows up
#     in the phonetic form (ambiguous or absent cases are not counted);
#   * a word directly following 'å' is recorded as an infinitive.
# A token that is not exactly 'word/phon' still raises ValueError, and a
# non-blank line with fewer than five fields still raises IndexError.
with open(IN_FILE_BOKMAAL_PHONO, 'r', encoding='utf8') as f:
    for line in f:
        if not line.strip():
            # Blank lines (e.g. a trailing empty line) carry no data.
            continue
        # Remove only the line terminator. A full strip() would also eat
        # leading tabs and thereby shift the column index whenever the
        # first fields of a row are empty.
        utterance = line.rstrip('\n').split('\t')[4]
        # The 'å' marker never carries over from one utterance to the next.
        infinitive = False
        for token in utterance.split(' '):
            if not token:
                # Doubled or leading/trailing spaces produce empty tokens;
                # skip them without touching the infinitive flag, so
                # 'å  gjøre' still counts 'gjøre' as an infinitive.
                continue
            bokmaal, phon = token.split('/')
            bokmaal = bokmaal.lower()
            # Most tokens are not tracked, so look up without raising.
            for lookup in (word2list, noun2list):
                bucket = lookup.get(bokmaal)
                if bucket is not None:
                    bucket.append(phon)
            if 'rs' in bokmaal:
                # Count only unambiguous realizations.
                if 'rs' in phon and 'ʂ' not in phon:
                    rs.append('rs')
                elif 'ʂ' in phon and 'rs' not in phon:
                    rs.append('ʂ')
            if 'sl' in bokmaal:
                if 'sl' in phon and 'ʂl' not in phon:
                    sl.append('sl')
                elif 'ʂl' in phon and 'sl' not in phon:
                    sl.append('ʂl')
            if infinitive:
                infinitives.append(bokmaal)
                infinitive_realizations.setdefault(bokmaal, []).append(phon)
            # The next token is an infinitive iff this one is the marker.
            infinitive = bokmaal == 'å'
# Expose the two sequence tallies in the same report section as the
# tracked words. They are added only after reading, so a corpus token
# spelled 'rs' or 'sl' can never be appended to them by the word lookup.
word2list['rs'] = rs
word2list['sl'] = sl
def _format_counts(realizations, limit=None):
    """Return 'variant (count)' pairs, most frequent first, comma-joined.

    With limit=None every distinct realization is listed; otherwise only
    the `limit` most common ones.
    """
    ranked = Counter(realizations).most_common(limit)
    return ', '.join(
        '{} ({})'.format(variant, count) for variant, count in ranked)


def _headed_sections():
    """Yield (heading, realizations) for tracked words, then for nouns."""
    yield from word2list.items()
    for noun in nouns:
        yield noun, noun2list[noun]


# Write the report: one upper-cased heading per word/noun followed by its
# realization counts, then the 30 most frequent infinitives with their
# five most common realizations each.
with open(OUT_FILE, 'w+', encoding='utf8') as report:
    for heading, realizations in _headed_sections():
        report.write('{}\n'.format(heading.upper()))
        report.write(_format_counts(realizations) + '\n\n')

    report.write('INFINITIVES\n')
    for verb, total in Counter(infinitives).most_common(30):
        report.write('{} ({}): '.format(verb, total))
        report.write(_format_counts(infinitive_realizations[verb], 5))
        report.write('\n')