-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathadd-tags.cc
89 lines (87 loc) · 3.22 KB
/
add-tags.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2012 Aix-Marseille Univ.
// Author: benoit.favre@lif.univ-mrs.fr (Benoit Favre)
#include <map>
#include <vector>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <fst/fstlib.h>
// Builds a linear transducer that maps each word read from standard input to
// its candidate tags and writes it, in OpenFst binary form, to standard output.
//
// argv[1] names a dictionary file with one entry per line:
//     <word> <tag> [<tag> ...]
// (whitespace separated). For the i-th input word, one arc is added from
// state i-1 to state i per candidate tag, with the word as input label and the
// tag as output label. Words missing from the dictionary receive the single
// tag "np". A dictionary entry that lists no tags contributes no arcs.
//
// Returns 0 on success, 1 on a usage error, an unreadable dictionary, or a
// failed write.
int main(int argc, char** argv) {
if(argc != 2) {
std::cerr << "usage: " << argv[0] << " <dict>\n";
return 1;
}

// Tag ids are 1-based: reverse_tags[id - 1] is the name of tag `id`.
const int unknown_tag_id = 1;
std::map<std::string, int> tags;
std::vector<std::string> reverse_tags;
tags["np"] = unknown_tag_id;
reverse_tags.push_back("np");

// Word ids are 1-based as well: tags_for_word[id - 1] holds the tag ids of
// word `id`.
std::map<std::string, int> words;
std::vector<std::vector<int> > tags_for_word;

std::ifstream input(argv[1]);
if(!input) {
// Without this check a missing file would spin forever: a stream that
// failed to open never reaches EOF.
std::cerr << "error: cannot open dictionary '" << argv[1] << "'\n";
return 1;
}

std::string line;
// Looping on the getline result (rather than on eof()) stops right after the
// last line and also terminates on read errors.
while(std::getline(input, line)) {
std::istringstream tokenizer(line);
std::string word;
if(!(tokenizer >> word)) continue; // blank line: nothing to register

std::vector<int> word_tags;
std::string tag;
while(tokenizer >> tag) {
int tag_id;
std::map<std::string, int>::const_iterator known = tags.find(tag);
if(known != tags.end()) {
tag_id = known->second;
} else {
// Derive the id from reverse_tags so it does not depend on whether
// map::operator[] inserts before or after size() is read (the order
// differs between C++17 and earlier standards).
reverse_tags.push_back(tag);
tag_id = static_cast<int>(reverse_tags.size());
tags[tag] = tag_id;
}
word_tags.push_back(tag_id);
}

// The id is the 1-based position in tags_for_word, so the two containers
// stay aligned even when a word appears twice (the last entry wins).
tags_for_word.push_back(word_tags);
words[word] = static_cast<int>(tags_for_word.size());
}

fst::StdVectorFst automaton;
fst::SymbolTable isyms("input");
fst::SymbolTable osyms("output");
isyms.AddSymbol("<eps>");
osyms.AddSymbol("<eps>");

const int start_state = automaton.AddState();
int previous_state = start_state;

std::string word;
while(std::cin >> word) {
const int next_state = automaton.AddState();

int word_id;
std::map<std::string, int>::const_iterator found = words.find(word);
if(found != words.end()) {
word_id = found->second;
} else {
// Unknown word: register it once with the fallback tag so repeated
// occurrences reuse the same entry.
tags_for_word.push_back(std::vector<int>(1, unknown_tag_id));
word_id = static_cast<int>(tags_for_word.size());
words[word] = word_id;
}

const std::vector<int>& candidates = tags_for_word[word_id - 1];
for(std::vector<int>::const_iterator tag = candidates.begin(); tag != candidates.end(); ++tag) {
int64 word_symbol = isyms.AddSymbol(word);
int64 tag_symbol = osyms.AddSymbol(reverse_tags[*tag - 1]);
automaton.AddArc(previous_state, fst::StdArc(word_symbol, tag_symbol, 0, next_state));
}
previous_state = next_state;
}

// The state reached after the last word is final; with empty input that is
// the start state itself.
automaton.SetFinal(previous_state, 0);
automaton.SetInputSymbols(&isyms);
automaton.SetOutputSymbols(&osyms);
automaton.SetStart(start_state);

// An empty filename makes OpenFst write to standard output.
if(!automaton.Write("")) {
std::cerr << "error: failed to write automaton to standard output\n";
return 1;
}
return 0;
}