Friday, August 26, 2011

stupid-unknown-extractor

This is a small utility for extracting unknowns from a pair of tagged streams. It's based on the idea that, if each sentence of an aligned pair contains exactly one unknown word, then those two words are likely to be translations of each other. It's stupid because it does nothing if either sentence contains more than one unknown.

Usage:
$ cat en.txt | apertium -d [path-to]/apertium-en-es/ en-es-tagger > en.tagged
$ cat es.txt | apertium -d [path-to]/apertium-en-es/ es-en-tagger > es.tagged
$ stupid-unknown-extractor en.tagged es.tagged

Sample output:

buckler	adarga	old<adj><sint> *buckler ,<cm> :: *adarga antiguo<adj><f><sg>
homespun	velludo	good<adj><sint><sup> *homespun .<sent> :: de<pr> *velludo para<pr>
gaunt	recia	*gaunt -<guio> :: complexión<n><f><sg> *recia ,<cm>

Compile:

g++ stupid_unknown_extractor.cc -o stupid-unknown-extractor

Code:

#include <cstdio>
#include <cwchar>
#include <iostream>
#include <list>
#include <string>
#include <vector>

using namespace std;

// Controls where read_sentence() segments the stream: when true, is_split()
// accepts tokens ending in <cm> as split points in addition to tokens ending
// in <sent>; when false, only tokens ending in <sent> split.
bool split_cm = true;

// Reports whether a token carries the <sent> tag at its very end.
// The token must be strictly longer than the tag itself, so a token that
// consists of nothing but "<sent>" is rejected.
inline bool
is_sent(wstring &in)
{
  static const wchar_t suffix[] = L"<sent>";
  const wstring::size_type suffix_len = sizeof(suffix) / sizeof(suffix[0]) - 1;

  if (in.size() <= suffix_len)
  {
    return false;
  }
  return in.compare(in.size() - suffix_len, suffix_len, suffix) == 0;
}

// Reports whether a token carries the <cm> tag at its very end.
// The token must be strictly longer than the tag itself, so a token that
// consists of nothing but "<cm>" is rejected.
inline bool
is_cm(wstring &in)
{
  static const wchar_t suffix[] = L"<cm>";
  const wstring::size_type suffix_len = sizeof(suffix) / sizeof(suffix[0]) - 1;

  if (in.size() <= suffix_len)
  {
    return false;
  }
  return in.compare(in.size() - suffix_len, suffix_len, suffix) == 0;
}

// Decides whether a token ends the current segment.
// A token ending in <sent> always does; a token ending in <cm> does only
// while the global split_cm flag is set.
inline bool
is_split (wstring &in)
{
  if (is_sent(in))
  {
    return true;
  }
  return split_cm && is_cm(in);
}

// Reads the next '^'...'$' delimited unit from input and returns the text
// found between the two delimiters (delimiters excluded).
//
// Outside a unit every character is skipped; a backslash there also swallows
// the character that follows it, so an escaped '^' does not open a unit.
// Inside a unit a backslash and the character after it are both copied to the
// result verbatim, so an escaped '$' does not close the unit.
//
// Returns an empty string when the stream ends (or a read fails) before a
// closing '$' is seen. Note that an empty unit ("^$") also yields an empty
// string, so callers must consult feof()/ferror() to tell the cases apart.
wstring
read_word(FILE *input)
{
  wstring out;
  bool inword = false;

  for (;;)
  {
    // Keep the raw wint_t: WEOF signals both end-of-file and read/encoding
    // errors. Testing only feof() would spin forever on an error, because
    // the EOF flag is never raised in that case.
    const wint_t raw = fgetwc(input);
    if (raw == WEOF)
    {
      break;
    }
    const wchar_t c = static_cast<wchar_t>(raw);

    if (!inword)
    {
      if (c == L'^')
      {
        inword = true;
      }
      else if (c == L'\\')
      {
        // Discard the escaped character; a failure here is picked up by the
        // next fgetwc() at the top of the loop.
        fgetwc(input);
      }
      continue;
    }

    if (c == L'$')
    {
      return out;
    }

    out += c;
    if (c == L'\\')
    {
      // Copy the escaped character as-is, but never store WEOF as text.
      const wint_t escaped = fgetwc(input);
      if (escaped == WEOF)
      {
        break;
      }
      out += static_cast<wchar_t>(escaped);
    }
  }

  return L"";
}

// Writes the one-line command synopsis to wcout and flushes it.
void usage()
{
  static const wchar_t synopsis[] =
    L"usage: stupid-unknown-extractor file1 file2 [output]";
  wcout << synopsis << endl;
}

// Appends words from file to tokens until a split token (see is_split) has
// been appended.
//
// Returns true when a split token was reached, i.e. tokens now ends with a
// complete segment. Returns false when the stream is null, hits end-of-file,
// or reports a read error first; tokens may then hold a partial segment,
// including a trailing empty word produced by the failed read.
bool
read_sentence (FILE* file, vector<wstring> &tokens)
{
  if (file == NULL)
  {
    return false;
  }

  // Also stop on ferror(): after a read/encoding error the EOF flag stays
  // clear, so looping on feof() alone would never terminate.
  while (!feof(file) && !ferror(file))
  {
    wstring word = read_word(file);
    tokens.push_back(word);
    if (is_split(word))
    {
      return true;
    }
  }
  return false;
}

// Returns the positions, in ascending order, of every token in sentence
// whose first character is '*' (the marker tested for unknown words).
// Empty tokens are never reported.
//
// The sentence is taken by const reference so that a call does not copy
// every token.
vector<int>
unknown_indices(const vector<wstring> &sentence)
{
  vector<int> index;

  for (vector<wstring>::size_type pos = 0; pos < sentence.size(); ++pos)
  {
    const wstring &token = sentence[pos];
    // Check emptiness explicitly rather than relying on what operator[]
    // yields at position size().
    if (!token.empty() && token[0] == L'*')
    {
      index.push_back(static_cast<int>(pos));
    }
  }
  return index;
}

// Writes the token at position index together with its immediate neighbours
// to out, separated by single spaces: "<previous> <token> <next>".
// A neighbour that does not exist (token is first or last in sent) is simply
// left out along with its separating space. Nothing is written when index
// does not refer to an element of sent.
void
print_context(FILE* out, vector<wstring> &sent, int index)
{
  if (index < 0)
  {
    return;
  }
  const vector<wstring>::size_type pos =
    static_cast<vector<wstring>::size_type>(index);
  if (pos >= sent.size())
  {
    return;
  }

  if (pos >= 1)
  {
    fputws(sent[pos - 1].c_str(), out);
    fputwc(L' ', out);
  }

  fputws(sent[pos].c_str(), out);

  // Guard the look-ahead: reading sent[pos + 1] when the token is the last
  // element would index past the end of the vector.
  if (pos + 1 < sent.size())
  {
    fputwc(L' ', out);
    fputws(sent[pos + 1].c_str(), out);
  }
}

// Writes the token at position index to out with its first character
// removed. If that first character is not the '*' marker, a diagnostic
// naming the token is sent to wcerr first; the token is still written.
// Access is bounds-checked via at(), so an invalid index throws.
void
print_unk(FILE* out, vector<wstring> &sent, int index)
{
  const wstring &token = sent.at(index);

  const bool marked = (token[0] == L'*');
  if (!marked)
  {
    wcerr << L"Error with unknown: " << token << endl;
  }

  const wstring stripped = token.substr(1);
  fputws(stripped.c_str(), out);
}

// Emits one output line for a sentence pair, but only when each side holds
// exactly one unknown token; otherwise nothing is written.
// Line layout: left unknown, TAB, right unknown, TAB, left context,
// " :: ", right context, newline.
void
try_output(FILE* out, vector<wstring> &left, vector<wstring> &right)
{
  const vector<int> left_unknowns = unknown_indices(left);
  const vector<int> right_unknowns = unknown_indices(right);

  const bool single_pair =
    (left_unknowns.size() == 1) && (right_unknowns.size() == 1);
  if (!single_pair)
  {
    return;
  }

  const int left_pos = left_unknowns[0];
  const int right_pos = right_unknowns[0];

  // The candidate translation pair.
  print_unk(out, left, left_pos);
  fputwc(L'\t', out);
  print_unk(out, right, right_pos);
  fputwc(L'\t', out);

  // Neighbouring tokens on each side.
  print_context(out, left, left_pos);
  fputws(L" :: ", out);
  print_context(out, right, right_pos);
  fputwc(L'\n', out);
}

// Entry point: stupid-unknown-extractor file1 file2 [output]
//
// Reads file1 and file2 segment by segment in lockstep and hands each pair
// of segments to try_output(). Results go to the optional third argument,
// or to stdout when it is omitted. Processing stops as soon as either input
// fails to deliver another complete segment.
//
// Returns 0 on success and 1 on a usage error or when a file cannot be
// opened.
//
// NOTE(review): no setlocale() call is made, so wide-character I/O runs in
// the default "C" locale -- confirm this is intended for non-ASCII input.
int main (int argc, char** argv)
{
  if (argc < 3 || argc > 4)
  {
    usage();
    return 1;
  }

  // Open the inputs before the output so that a bad input path does not
  // create or truncate the output file.
  FILE* left = fopen(argv[1], "rb");
  if (left == NULL)
  {
    wcerr << L"Error: cannot open input file " << argv[1] << endl;
    return 1;
  }

  FILE* right = fopen(argv[2], "rb");
  if (right == NULL)
  {
    wcerr << L"Error: cannot open input file " << argv[2] << endl;
    fclose(left);
    return 1;
  }

  FILE* out = stdout;
  if (argc == 4)
  {
    out = fopen(argv[3], "wb");
    if (out == NULL)
    {
      wcerr << L"Error: cannot open output file " << argv[3] << endl;
      fclose(left);
      fclose(right);
      return 1;
    }
  }

  vector<wstring> sentl;
  vector<wstring> sentr;

  while (read_sentence(left, sentl) && read_sentence(right, sentr))
  {
    try_output(out, sentl, sentr);
    sentl.clear();
    sentr.clear();
  }

  fclose(left);
  fclose(right);
  // stdout is flushed and closed by the runtime on return; only close a
  // file this function opened itself.
  if (out != stdout)
  {
    fclose(out);
  }
  return 0;
}