Use of zemberek.morphology.old_ambiguity.AbstractDisambiguator.SentenceData in the project zemberek-nlp by ahmetaa.
Shown below is the method extract of the class DataConverter.
/**
 * Reads the sentence data set at {@code dataPath}, tries to align every reference parse
 * with one of the analyses produced by {@link TurkishMorphology}, and hands the sentences
 * for which every token could be aligned to {@code Scripts.saveUnambiguous}.
 *
 * <p>Token/parse pairs that could not be aligned are counted in a histogram which is
 * written to {@code parse-fails.txt} (a relative path, so it is resolved against the
 * current working directory). A summary line is printed to standard output.
 *
 * @param dataPath UTF-8 text file that is loaded through {@code DataSetLoader}
 * @param output   destination passed to {@code Scripts.saveUnambiguous}
 * @throws IOException if reading the data set or writing one of the outputs fails
 */
private static void extract(Path dataPath, Path output) throws IOException {
  DataSet set = com.google.common.io.Files
      .asCharSource(dataPath.toFile(), Charsets.UTF_8)
      .readLines(new DataSetLoader());
  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder()
          .addTextDictionaryResources(
              "tr/master-dictionary.dict",
              "tr/non-tdk.dict",
              "tr/proper.dict",
              "tr/proper-from-corpus.dict",
              "tr/abbreviations.dict",
              "tr/person-names.dict")
          .build());

  List<SentenceAnalysis> result = new ArrayList<>();
  Histogram<String> parseFails = new Histogram<>();

  for (SentenceData sentenceData : set) {
    List<String> tokens = Splitter.on(" ").splitToList(sentenceData.sentence());
    // Tokens and reference parses are paired by index, so the counts must agree.
    if (tokens.isEmpty() || tokens.size() != sentenceData.correctParse.size()) {
      continue;
    }

    List<SentenceWordAnalysis> correctList = new ArrayList<>(tokens.size());
    for (int i = 0; i < tokens.size(); i++) {
      String s = tokens.get(i);
      String p = normalizeOldParse(sentenceData.correctParse.get(i));

      WordAnalysis a = morphology.analyze(s);
      if (!a.isCorrect()) {
        // Leaving the loop early keeps correctList shorter than tokens, which makes the
        // size check below reject the whole sentence.
        break;
      }

      SingleAnalysis best = findAnalysisMatching(a, p);
      if (best == null
          && !s.isEmpty() // Splitter.on(" ") yields empty tokens for consecutive spaces
          && Character.isUpperCase(s.charAt(0))
          && p.contains("+noun")
          && !p.contains("prop")) {
        // Capitalized token whose reference parse is a plain noun: retry with "+prop"
        // inserted after the first "+noun".
        best = findAnalysisMatching(a, p.replaceFirst("\\+noun", "\\+noun+prop"));
      }

      if (best == null) {
        // Keep scanning the remaining tokens so their failures are counted as well.
        parseFails.add(s + " " + p);
      } else {
        correctList.add(new SentenceWordAnalysis(best, a));
      }
    }

    // Only sentences in which every token was aligned are kept.
    if (correctList.size() == tokens.size()) {
      result.add(new SentenceAnalysis(sentenceData.sentence(), correctList));
    }
  }

  Scripts.saveUnambiguous(result, output);
  // NOTE(review): presumably drops entries seen fewer than 3 times - confirm in Histogram.
  parseFails.removeSmaller(3);
  parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
  System.out.format("Full Sentence Match = %d in %d%n", result.size(), set.sentences.size());
}

/**
 * Rewrites a reference parse string from the data set through a fixed sequence of regex
 * replacements, lower-cases it with the Turkish locale, and finally maps it through
 * {@code lookup} when that table has an entry for the rewritten string.
 */
private static String normalizeOldParse(String parse) {
  String p = parse;
  p = p.replaceAll("PCNom", "PCNOM"); // must run before the "Nom" removal below
  p = p.replaceAll("Pnon|Nom", "");
  p = p.replaceAll("\\+Pos\\+", "+");
  p = p.replaceAll("\\+Pos\\^DB", "^DB");
  // Clean up separators left behind by the removals above.
  p = p.replaceAll("[+]+", "+");
  p = p.replaceAll("[+]$", "");
  p = p.replaceAll("[+]\\^DB", "^DB");
  p = p.replaceAll("[.]", "");
  p = p.toLowerCase(Turkish.LOCALE);
  p = p.replaceAll("adverb", "adv");
  p = p.replaceAll("\\+cop\\+a3sg", "+a3sg+cop");
  // NOTE(review): p is already lower-cased at this point, so the capitalized pattern
  // "+Unable" can never match and this call is a no-op. It is kept unchanged to preserve
  // current results; decide whether it should run before toLowerCase or be removed.
  p = p.replaceAll("\\+Unable", "^DB+Verb+Able+Neg");
  if (lookup.containsKey(p)) {
    p = lookup.get(p);
  }
  return p;
}

/**
 * Returns the first analysis in {@code analyses} whose {@code convert} output equals
 * {@code target}, or {@code null} when there is none.
 */
private static SingleAnalysis findAnalysisMatching(WordAnalysis analyses, String target) {
  for (SingleAnalysis analysis : analyses) {
    if (convert(analysis).equals(target)) {
      return analysis;
    }
  }
  return null;
}
Aggregations