use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.
the class DisambiguatorPreprocessor method processOflazerAnalyzerOutputSak.
/**
 * Reads analyzer output from {@code oflazerResult} (UTF-8) line by line and writes a
 * sentence-delimited version of it to {@code out}.
 *
 * <p>Each non-empty line is split at the first tab; the part before the tab is the word.
 * A blank line flushes the parses collected so far through {@code processParses}. A line
 * whose word is {@code #} ends the current sentence: a punctuation entry for {@code #} and
 * a closing {@code </S>} are written. An opening {@code <S>} is written when the first
 * non-empty line of a sentence is seen.
 *
 * <p>The writer is closed in a {@code finally} block so the output stream is released even
 * when reading or processing fails part-way through.
 *
 * @param oflazerResult analyzer output file to read
 * @param out file to write the converted output to
 * @throws IOException if reading the input or writing the output fails
 */
public void processOflazerAnalyzerOutputSak(File oflazerResult, File out) throws IOException {
  SimpleTextWriter sakFile = SimpleTextWriter.keepOpenUTF8Writer(out);
  try {
    LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
    boolean sentenceStarted = false;
    List<String> parses = new ArrayList<>();
    while (li.hasNext()) {
      String line = li.next().trim();
      String word = Strings.subStringUntilFirst(line, "\t").trim();
      boolean blank = line.isEmpty();
      // Blank lines outside of a sentence carry no information.
      if (blank && !sentenceStarted) {
        continue;
      }
      // A blank line inside a sentence terminates the parse group of the current word.
      if (blank && !parses.isEmpty()) {
        sakFile.writeLine(processParses(parses));
        parses = new ArrayList<>();
      }
      if (!blank) {
        if (parses.isEmpty()) {
          if (!sentenceStarted) {
            sakFile.writeLine("<S>");
          }
          sentenceStarted = true;
        }
        if (punctuations.contains(word)) {
          // The analyzer does not parse punctuation, so the parse is produced here.
          parses.add(word + "\t" + word + "\t+Punc");
        } else if (!line.endsWith("?")) {
          parses.add(line);
        } else if (!word.equals("#")) {
          // A trailing '?' marks a line without a usable parse; infer one from the word.
          String inferred = inferUnknownWordParse(word);
          System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
          parses.add(inferred);
        }
      }
      // '#' is the sentence boundary marker.
      if (word.equals("#")) {
        sentenceStarted = false;
        sakFile.writeLine("#\t#\t+Punc");
        sakFile.writeLine("</S>");
        parses = new ArrayList<>();
      }
    }
  } finally {
    sakFile.close();
  }
}
use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.
the class AmbiguityStats method readAll.
/**
 * Reads every line of the given file, normalizes quotes and hyphens, tokenizes the line
 * with {@code lexer} and re-joins the tokens with single spaces.
 *
 * @param filename path of the file to read
 * @return one space-joined token string per input line, in file order
 * @throws IOException if the file cannot be read
 */
public List<String> readAll(String filename) throws IOException {
  LineIterator lineIterator =
      SimpleTextReader.trimmingUTF8Reader(new File(filename)).getLineIterator();
  List<String> tokenizedLines = new ArrayList<>();
  while (lineIterator.hasNext()) {
    String rawLine = lineIterator.next();
    String normalized = TextUtil.normalizeQuotesHyphens(rawLine);
    String joined = Joiner.on(" ").join(lexer.tokenizeToStrings(normalized));
    tokenizedLines.add(joined);
  }
  return tokenizedLines;
}
use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.
the class SmoothLmTest method testActualData.
/**
 * Manual timing run against files under hard-coded local paths (ignored by default).
 *
 * <p>Builds the compressed model file from the ARPA file if it does not exist yet, loads it,
 * collects up to {@code gramCount} n-grams of length {@code order} from the corpus file, and
 * prints the time spent on loading and on probability lookups together with the probability sum.
 *
 * @throws IOException if any of the model or corpus files cannot be read or written
 */
@Test
@Ignore
public void testActualData() throws IOException {
  Stopwatch sw = Stopwatch.createStarted();
  File lmFile = new File("/home/ahmetaa/data/lm/smoothnlp-test/lm1.slm");
  File tmp = new File("/tmp");
  if (!lmFile.exists()) {
    final File arpaFile = new File("/home/ahmetaa/data/lm/smoothnlp-test/lm1.arpa");
    UncompressedToSmoothLmConverter converter = new UncompressedToSmoothLmConverter(lmFile, tmp);
    converter.convertLarge(
        MultiFileUncompressedLm.generate(arpaFile, tmp, "utf-8", 4).dir,
        new UncompressedToSmoothLmConverter.NgramDataBlock(2, 1, 1),
        20);
  }
  SmoothLm lm = SmoothLm.builder(lmFile).build();
  System.out.println(sw.elapsed(TimeUnit.MILLISECONDS));
  sw.reset();
  final int order = 3;
  final int gramCount = 1000000;
  int[][] ids = new int[gramCount][order];
  long[] trigrams = new long[gramCount];
  LineIterator li = SimpleTextReader.trimmingUTF8LineIterator(
      new File("/home/ahmetaa/data/lm/smoothnlp-test/corpus-lowercase_1000000_2000000"));
  SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
  int i = 0;
  // Stop when the corpus runs out instead of calling next() past the last line.
  while (i < gramCount && li.hasNext()) {
    String line = li.next();
    String[] tokens = tokenizer.split(line);
    if (tokens.length < order) {
      continue;
    }
    // NOTE(review): the bound skips the last two windows of each line; a full sliding window
    // would be "j <= tokens.length - order". Kept as-is because the intent is not confirmed.
    for (int j = 0; j < tokens.length - order - 1; j++) {
      String[] words = new String[order];
      System.arraycopy(tokens, j, words, 0, order);
      int[] indexes = lm.getVocabulary().toIndexes(words);
      if (!lm.getVocabulary().containsAll(indexes)) {
        continue;
      }
      ids[i] = indexes;
      if (order == 3) {
        trigrams[i] = lm.getVocabulary().encodeTrigram(indexes);
      }
      i++;
      if (i == gramCount) {
        break;
      }
    }
  }
  sw.start();
  double tr = 0;
  // Only the first i rows hold collected n-grams; i equals gramCount when the corpus sufficed.
  for (int k = 0; k < i; k++) {
    tr += lm.getProbability(ids[k]);
  }
  System.out.println(sw.elapsed(TimeUnit.MILLISECONDS));
  System.out.println("tr = " + tr);
}
use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.
the class AmbiguityStats method readAll.
/**
 * Loads a text file and returns its lines after quote/hyphen normalization and
 * tokenization by {@code lexer}, with the tokens of each line joined by one space.
 *
 * @param filename path of the file to load
 * @return the processed lines in their original order
 * @throws IOException if the file cannot be read
 */
public List<String> readAll(String filename) throws IOException {
  List<String> result = new ArrayList<>();
  File source = new File(filename);
  for (LineIterator reader = SimpleTextReader.trimmingUTF8Reader(source).getLineIterator();
      reader.hasNext(); ) {
    String cleaned = TextUtil.normalizeQuotesHyphens(reader.next());
    result.add(Joiner.on(" ").join(lexer.tokenizeToStrings(cleaned)));
  }
  return result;
}
use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.
the class DisambiguatorPreprocessor method processOflazerAnalyzerOutputYuret.
/**
 * Reads analyzer output from {@code oflazerResult} (UTF-8) line by line and writes a
 * document/sentence-delimited version of it to {@code out}, encoded as ISO-8859-9.
 *
 * <p>The output is wrapped in {@code <DOC>} / {@code </DOC>} marker lines. Every occurrence of
 * {@code AorPart} in an input line is replaced with {@code PresPart}. Each non-empty line is
 * split at the first tab; the part before the tab is the word. A blank line writes out the
 * parses collected so far followed by two empty lines. A line whose word is {@code #} ends the
 * current sentence and writes the {@code </S>} marker. The {@code <S>} marker is written when
 * the first non-empty line of a sentence is seen.
 *
 * <p>The writer is closed in a {@code finally} block so the output stream is released even
 * when reading or processing fails part-way through.
 *
 * @param oflazerResult analyzer output file to read
 * @param out file to write the converted output to
 * @throws IOException if reading the input or writing the output fails
 */
public void processOflazerAnalyzerOutputYuret(File oflazerResult, File out) throws IOException {
  SimpleTextWriter yuretFileWriter =
      SimpleTextWriter.keepOpenWriter(new FileOutputStream(out), "ISO-8859-9");
  try {
    yuretFileWriter.writeLine("<DOC>\t<DOC>");
    yuretFileWriter.writeLine();
    LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
    boolean sentenceStarted = false;
    List<String> parses = new ArrayList<>();
    while (li.hasNext()) {
      // Literal replacement; the search text has no regex metacharacters.
      String line = li.next().trim().replace("AorPart", "PresPart");
      String word = Strings.subStringUntilFirst(line, "\t").trim();
      boolean blank = line.isEmpty();
      // Blank lines outside of a sentence carry no information.
      if (blank && !sentenceStarted) {
        continue;
      }
      // A blank line inside a sentence terminates the parse group of the current word.
      if (blank && !parses.isEmpty()) {
        yuretFileWriter.writeLines(parses);
        yuretFileWriter.writeLine();
        yuretFileWriter.writeLine();
        parses = new ArrayList<>();
      }
      if (!blank) {
        if (parses.isEmpty()) {
          if (!sentenceStarted) {
            yuretFileWriter.writeLine("<S>\t<S>");
            yuretFileWriter.writeLine();
          }
          sentenceStarted = true;
        }
        if (punctuations.contains(word)) {
          // The analyzer does not parse punctuation, so the parse is produced here.
          parses.add(word + "\t" + word + "\t+Punc");
        } else if (!line.endsWith("?")) {
          parses.add(line);
        } else if (!word.equals("#")) {
          // A trailing '?' marks a line without a usable parse; infer one from the word.
          String inferred = inferUnknownWordParse(word);
          System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
          parses.add(inferred);
        }
      }
      // '#' is the sentence boundary marker.
      if (word.equals("#")) {
        sentenceStarted = false;
        yuretFileWriter.writeLine("</S>\t</S>\n");
        parses = new ArrayList<>();
      }
    }
    yuretFileWriter.writeLine("</DOC>\t</DOC>");
  } finally {
    yuretFileWriter.close();
  }
}
Aggregations