use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class AmbiguityStats method noParse.
/**
 * Collects the tokens for which the word analyzer returns no analysis and prints a summary.
 *
 * <p>Every line of every given file is split on single spaces (empty pieces dropped, pieces
 * trimmed). Each token is normalized with {@code TurkishAlphabet.INSTANCE.normalize} before
 * being analyzed; tokens with an empty analysis list are counted in a histogram under their
 * original (non-normalized) form. A progress line is printed every 50000 tokens and the
 * running total is printed after each file. Finally every unanalyzed token seen more than
 * five times is printed with its count and the collected {@code Stats} are dumped.
 *
 * @param filename the files to read, passed one by one to {@code readAll}
 * @throws IOException declared by this method; presumably raised while reading a file
 */
public void noParse(String... filename) throws IOException {
    Histogram<String> unparsed = new Histogram<>(1000000);
    // The splitter configuration does not depend on the file, so build it once.
    Splitter tokenizer = Splitter.on(" ").omitEmptyStrings().trimResults();
    int total = 0;
    for (String file : filename) {
        for (String line : readAll(file)) {
            for (String token : tokenizer.split(line)) {
                List<WordAnalysis> analyses = parser.getWordAnalyzer()
                        .analyze(TurkishAlphabet.INSTANCE.normalize(token));
                total++;
                if (total % 50000 == 0) {
                    System.out.println("Processed: " + total);
                }
                if (analyses.isEmpty()) {
                    // Keep the raw token, not the normalized form that was analyzed.
                    unparsed.add(token);
                }
            }
        }
        System.out.println("Total: " + total);
    }

    Stats stats = new Stats(0.0002);
    stats.allCounts = (int) unparsed.totalCount();
    stats.allUniques = unparsed.size();
    for (String token : unparsed.getSortedList()) {
        int occurrences = unparsed.getCount(token);
        if (occurrences <= 5) {
            continue;
        }
        stats.significantCounts += occurrences;
        stats.significantUniques++;
        System.out.println(token + " : " + occurrences);
    }
    stats.dump();
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class Z3MarkovModelDisambiguator method disambiguate.
/**
 * Reorders the parses of each word so that the parse chosen by {@code bestSequence}
 * ends up at index 0.
 *
 * <p>For every position of the best sequence the parse list of that position is fetched
 * from the sentence; unless the list has exactly one element, the element at index 0 is
 * swapped with the element at the chosen index. The lists are modified in place.
 *
 * @param sentenceParse the sentence analysis whose parse lists are reordered
 */
@Override
public void disambiguate(SentenceAnalysis sentenceParse) {
    int[] best = bestSequence(getAmbiguousSequence(sentenceParse));
    for (int position = 0; position < best.length; position++) {
        List<WordAnalysis> parses = sentenceParse.getParses(position);
        if (parses.size() != 1) {
            // Swap the current head with the selected parse.
            WordAnalysis head = parses.get(0);
            int chosen = best[position];
            parses.set(0, parses.get(chosen));
            parses.set(chosen, head);
        }
    }
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class Z3MarkovModelDisambiguator method getAmbiguousSequence.
/**
 * Converts a sentence analysis into an array of {@code Ambiguous} entries holding, for each
 * parse of each word, a root vocabulary index and a last-inflectional-group vocabulary index.
 *
 * <p>The returned array has {@code sentence.size() + 3} slots: two leading {@code startWord}
 * entries, one entry per sentence entry, and a trailing {@code endWord}.
 *
 * @param sentence the analyzed sentence to convert
 * @return the padded sequence of ambiguous words
 */
public Ambiguous[] getAmbiguousSequence(SentenceAnalysis sentence) {
    Ambiguous[] sequence = new Ambiguous[sentence.size() + 3];
    sequence[0] = startWord;
    sequence[1] = startWord;
    int slot = 2;
    for (SentenceAnalysis.Entry entry : sentence) {
        int parseCount = entry.parses.size();
        int[] rootIds = new int[parseCount];
        int[] lastIgIds = new int[parseCount];
        int p = 0;
        for (WordAnalysis parse : entry.parses) {
            // Root key: the lemma, optionally extended with information from the first IG.
            String rootKey = parse.dictionaryItem.lemma;
            WordAnalysis.InflectionalGroup firstIg = parse.inflectionalGroups.get(0);
            if (firstIg.suffixList.isEmpty()) {
                rootKey = rootKey + firstIg.formatNoSurface();
            } else {
                String formatted = firstIg.formatNoSurface();
                String afterSemicolon = Strings.subStringAfterFirst(formatted, ";");
                // Only when the suffix part is exactly "A3sg+Pnon+Nom)" is the part before
                // the ';' (closed with ')') appended; otherwise the lemma is used alone.
                if (afterSemicolon.equals("A3sg+Pnon+Nom)")) {
                    rootKey = rootKey + (Strings.subStringUntilFirst(formatted, ";") + ")");
                }
            }
            rootIds[p] = rootLm.getVocabulary().indexOf(rootKey);

            // IG key: the last IG, prefixed by the one before it when that one has no suffixes.
            int igCount = parse.inflectionalGroups.size();
            String igKey;
            if (igCount > 1 && parse.inflectionalGroups.get(igCount - 2).suffixList.isEmpty()) {
                WordAnalysis.InflectionalGroup beforeLast = parse.inflectionalGroups.get(igCount - 2);
                // NOTE(review): the last IG is concatenated via its string conversion here,
                // whereas the other branch uses formatNoSurface() - confirm this is intended.
                igKey = beforeLast.formatNoSurface() + parse.getLastIg();
            } else {
                igKey = parse.getLastIg().formatNoSurface();
            }
            lastIgIds[p] = igLm.getVocabulary().indexOf(igKey);
            p++;
        }
        sequence[slot++] = new Ambiguous(rootIds, lastIgIds);
    }
    sequence[slot] = endWord;
    return sequence;
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class Z3ModelA method disambiguate.
@Override
public void disambiguate(SentenceAnalysis sentenceParse) {
Ambiguous[] ambiguousSeq = getAmbiguousSequence(sentenceParse);
int[] bestSequence = bestSequence(ambiguousSeq);
for (int i = 0; i < bestSequence.length; i++) {
List<WordAnalysis> results = sentenceParse.getParses(i);
if (results.size() == 1) {
continue;
}
WordAnalysis tmp = results.get(0);
results.set(0, results.get(bestSequence[i]));
results.set(bestSequence[i], tmp);
}
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class Z3ModelA method getAmbiguousSequence.
/**
 * Converts a sentence analysis into an array of {@code Ambiguous} entries holding, for each
 * parse of each word, the root vocabulary index and the vocabulary indexes of all of its
 * inflectional groups.
 *
 * <p>The returned array has {@code sentence.size() + 3} slots: two leading {@code startWord}
 * entries, one entry per sentence entry, and a trailing {@code endWord}.
 *
 * @param sentence the analyzed sentence to convert
 * @return the padded sequence of ambiguous words
 */
public Ambiguous[] getAmbiguousSequence(SentenceAnalysis sentence) {
    Ambiguous[] awords = new Ambiguous[sentence.size() + 3];
    awords[0] = startWord;
    awords[1] = startWord;
    int i = 2;
    for (SentenceAnalysis.Entry entry : sentence) {
        int[] roots = new int[entry.parses.size()];
        int[][] igs = new int[entry.parses.size()][];
        int j = 0;
        for (WordAnalysis parse : entry.parses) {
            // The root is looked up by lemma only.
            String rootPart = parse.dictionaryItem.lemma;
            roots[j] = rootLm.getVocabulary().indexOf(rootPart);

            int igCount = parse.inflectionalGroups.size();
            igs[j] = new int[igCount];
            // Bound the loop by k (the IG index). The previous condition tested j, the
            // parse index, so the loop was either skipped or ran k past the array end.
            for (int k = 0; k < igCount; k++) {
                igs[j][k] = igLm.getVocabulary().indexOf(parse.inflectionalGroups.get(k).formatNoSurface());
            }
            j++;
        }
        awords[i] = new Ambiguous(roots, igs);
        i++;
    }
    awords[i] = endWord;
    return awords;
}
Aggregations