This is a small utility for extracting unknown words from a pair of tagged streams. It's based on the idea that, if a sentence and its counterpart in the other stream each contain exactly one unknown word, then those two words are likely to be translations of each other. It's "stupid" because it does nothing when either sentence contains more than one unknown.
$ cat en.txt | apertium -d [path-to]/apertium-en-es/ en-es-tagger > en.tagged
$ cat es.txt | apertium -d [path-to]/apertium-en-es/ es-en-tagger > es.tagged
$ stupid-unknown-extractor en.tagged es.tagged
Sample output:
buckler adarga old<adj><sint> *buckler ,<cm> :: *adarga antiguo<adj><f><sg>
homespun velludo good<adj><sint><sup> *homespun .<sent> :: de<pr> *velludo para<pr>
gaunt recia *gaunt -<guio> :: complexión<n><f><sg> *recia ,<cm>
Compile:
g++ stupid_unknown_extractor.cc -o stupid-unknown-extractor
Code:
#include <clocale>
#include <cstdio>
#include <cstdlib>
#include <cwchar>
#include <iostream>
#include <list>
#include <string>
#include <vector>

using namespace std;

// Set to true to also split at <cm>
bool split_cm = true;

/**
 * Return true if `in` ends with `tag` and has at least one character before
 * it (a token consisting of the bare tag alone does not match).
 */
static bool ends_with_tag(const wstring &in, const wchar_t *tag)
{
  const size_t n = wcslen(tag);
  return in.size() > n && in.compare(in.size() - n, n, tag) == 0;
}

/** True if the token ends in the sentence-end tag "<sent>". */
inline bool is_sent(const wstring &in)
{
  return ends_with_tag(in, L"<sent>");
}

/** True if the token ends in the comma tag "<cm>". */
inline bool is_cm(const wstring &in)
{
  return ends_with_tag(in, L"<cm>");
}

/**
 * True if the token closes a segment: always on "<sent>", and also on "<cm>"
 * when the global `split_cm` is set.
 */
inline bool is_split(const wstring &in)
{
  if (split_cm) {
    return is_sent(in) || is_cm(in);
  }
  return is_sent(in);
}

/**
 * Read the next `^...$` delimited word from `input`.
 *
 * Characters outside a word are skipped; a backslash outside a word also
 * swallows the character after it, so an escaped `^` does not open a word.
 * Inside a word a backslash and the character after it are both copied to the
 * result verbatim, so an escaped `$` does not close the word.
 *
 * @return the text between `^` and `$` (may be empty for `^$`), or an empty
 *         string if the stream ends or fails before a complete word is read.
 *         Callers distinguish the two with feof()/ferror().
 */
wstring read_word(FILE *input)
{
  wstring out;
  bool inword = false;
  wint_t raw;

  // fgetwc returns WEOF both at end-of-file and on a read/decoding error;
  // testing the return value (rather than feof alone) stops in both cases.
  while ((raw = fgetwc(input)) != WEOF) {
    const wchar_t c = static_cast<wchar_t>(raw);

    if (!inword) {
      if (c == L'^') {
        inword = true;
      } else if (c == L'\\') {
        // Discard the escaped character.
        if (fgetwc(input) == WEOF) {
          break;
        }
      }
      continue;
    }

    if (c == L'$') {
      return out;
    }

    out += c;
    if (c == L'\\') {
      // Keep the escaped character together with its backslash.
      raw = fgetwc(input);
      if (raw == WEOF) {
        break;
      }
      out += static_cast<wchar_t>(raw);
    }
  }

  return L"";
}

/** Print the command-line synopsis to standard output. */
void usage()
{
  wcout << L"usage: stupid-unknown-extractor file1 file2 [output]" << endl;
}

/**
 * Append words from `file` to `tokens` up to and including the next split
 * token (see is_split).
 *
 * @return true if a split token was reached; false if the input ended or
 *         failed first. In the false case `tokens` holds whatever partial
 *         segment was read.
 */
bool read_sentence(FILE *file, vector<wstring> &tokens)
{
  while (!feof(file) && !ferror(file)) {
    const wstring word = read_word(file);

    // An empty result with the stream exhausted is "no more words",
    // not a real (empty) token.
    if (word.empty() && (feof(file) || ferror(file))) {
      break;
    }

    tokens.push_back(word);
    if (is_split(word)) {
      return true;
    }
  }
  return false;
}

/**
 * Return the positions of all tokens in `sentence` that start with `*`
 * (the marker for unknown words), in ascending order.
 */
vector<int> unknown_indices(const vector<wstring> &sentence)
{
  vector<int> index;
  for (size_t i = 0; i < sentence.size(); ++i) {
    const wstring &token = sentence[i];
    if (!token.empty() && token[0] == L'*') {
      index.push_back(static_cast<int>(i));
    }
  }
  return index;
}

/**
 * Write the token at `index` with its immediate neighbours, separated by
 * single spaces: "[previous ]token[ next]". A neighbour that does not exist
 * is omitted; an out-of-range `index` writes nothing.
 */
void print_context(FILE *out, const vector<wstring> &sent, int index)
{
  if (index < 0 || static_cast<size_t>(index) >= sent.size()) {
    return;
  }
  const size_t i = static_cast<size_t>(index);

  if (i >= 1) {
    fputws(sent[i - 1].c_str(), out);
    fputwc(L' ', out);
  }
  fputws(sent[i].c_str(), out);
  if (i + 1 < sent.size()) {
    fputwc(L' ', out);
    fputws(sent[i + 1].c_str(), out);
  }
}

/**
 * Write the token at `index` without its first character (the `*` marker).
 *
 * If the token does not start with `*`, a diagnostic goes to wcerr and the
 * token is still written minus its first character. An empty token only
 * produces the diagnostic. `index` is bounds-checked by vector::at, which
 * throws std::out_of_range.
 */
void print_unk(FILE *out, const vector<wstring> &sent, int index)
{
  const wstring &token = sent.at(index);

  if (token.empty() || token[0] != L'*') {
    wcerr << L"Error with unknown: " << token << endl;
  }
  if (!token.empty()) {
    fputws(token.c_str() + 1, out);
  }
}

/**
 * If `left` and `right` each contain exactly one unknown word, write one
 * record to `out`:
 *
 *   left-unknown TAB right-unknown TAB left-context " :: " right-context NL
 *
 * Otherwise write nothing.
 */
void try_output(FILE *out, vector<wstring> &left, vector<wstring> &right)
{
  const vector<int> lvec = unknown_indices(left);
  const vector<int> rvec = unknown_indices(right);

  if (lvec.size() != 1 || rvec.size() != 1) {
    return;
  }

  print_unk(out, left, lvec[0]);
  fputwc(L'\t', out);
  print_unk(out, right, rvec[0]);
  fputwc(L'\t', out);
  // context
  print_context(out, left, lvec[0]);
  fputws(L" :: ", out);
  print_context(out, right, rvec[0]);
  fputwc(L'\n', out);
}

/** Open `path` with `mode`, or report the failure on wcerr and exit(1). */
static FILE *open_or_exit(const char *path, const char *mode)
{
  FILE *f = fopen(path, mode);
  if (f == NULL) {
    wcerr << L"stupid-unknown-extractor: cannot open '" << path << L"'" << endl;
    exit(1);
  }
  return f;
}

/**
 * Usage: stupid-unknown-extractor file1 file2 [output]
 *
 * Reads the two tagged files segment by segment in lockstep and writes a
 * record for every segment pair with exactly one unknown on each side.
 * Output goes to `output` if given, otherwise to standard output.
 * Exits with 1 on a usage error, an unopenable file or a read error.
 */
int main(int argc, char **argv)
{
  // Adopt the environment's locale so the wide-character stdio functions can
  // decode/encode multibyte text; without this call the program stays in the
  // "C" locale, where non-ASCII input generally cannot be converted.
  setlocale(LC_ALL, "");

  if (argc < 3 || argc > 4) {
    usage();
    exit(1);
  }

  // Open the inputs first so that a bad input path does not truncate an
  // existing output file.
  FILE *left = open_or_exit(argv[1], "rb");
  FILE *right = open_or_exit(argv[2], "rb");
  FILE *out = (argc == 3) ? stdout : open_or_exit(argv[3], "wb");

  vector<wstring> sentl;
  vector<wstring> sentr;

  while (read_sentence(left, sentl) && read_sentence(right, sentr)) {
    try_output(out, sentl, sentr);
    sentl.clear();
    sentr.clear();
  }

  int status = 0;
  if (ferror(left) || ferror(right)) {
    wcerr << L"stupid-unknown-extractor: error while reading input "
          << L"(invalid character encoding for the current locale?)" << endl;
    status = 1;
  }

  fclose(left);
  fclose(right);
  fclose(out);

  return status;
}