Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class AmbiguityStats, method ambiguousGroupStats.
/**
 * Prints statistics about ambiguity groups found in the given file.
 *
 * <p>Every token produced by {@code splitter} (declared outside this method) is analyzed with
 * {@code parser}. Tokens with more than one analysis are grouped under the key returned by
 * {@code generateKeyFromParse}. For each key that passes {@code Stats.overCutoff}, the key, its
 * count and its share of all ambiguous tokens are printed, followed by the member words whose
 * {@code pct} relative to the key count is above 0.1.
 *
 * @param filename file whose lines are read with {@code readAll}
 * @throws IOException declared for the file read
 */
public void ambiguousGroupStats(String filename) throws IOException {
  List<String> lines = readAll(filename);
  Histogram<String> keyHistogram = new Histogram<>(1000000);
  Map<String, Histogram<String>> wordsByKey = Maps.newHashMap();
  int total = 0;

  for (String line : lines) {
    for (String token : splitter.split(line)) {
      WordAnalysis analysis = parser.analyze(token);
      total++;
      if (total % 50000 == 0) {
        System.out.println("Processed: " + total);
      }
      // Only tokens with more than one analysis are of interest here.
      if (analysis.analysisCount() <= 1) {
        continue;
      }
      String key = generateKeyFromParse(analysis);
      keyHistogram.add(key);
      if (!wordsByKey.containsKey(key)) {
        wordsByKey.put(key, new Histogram<>());
      }
      wordsByKey.get(key).add(token);
    }
  }
  System.out.println("Total: " + total);

  Stats st = new Stats(0.1);
  st.allCounts = (int) keyHistogram.totalCount();
  st.allUniques = keyHistogram.size();

  for (String key : keyHistogram.getSortedList()) {
    int keyCount = keyHistogram.getCount(key);
    if (!st.overCutoff(keyCount)) {
      continue;
    }
    String share = percentStr(keyCount, st.allCounts);
    st.significantCounts += keyCount;
    st.significantUniques++;
    System.out.println(key + " : " + keyCount + " " + pp(share));

    // List the member words that make up a noticeable part of this group.
    Histogram<String> words = wordsByKey.get(key);
    for (String word : words.getSortedList()) {
      int wordCount = words.getCount(word);
      if (pct(wordCount, keyCount) > 0.1) {
        System.out.println(word + " : " + wordCount);
      }
    }
    System.out.println();
  }
  st.dump();
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class AmbiguityStats, method noParse.
/**
 * Prints the tokens of the given files for which {@code parser} returns no analysis.
 *
 * <p>Lines are split on single spaces (empty pieces dropped, results trimmed). A running total of
 * processed tokens is printed after each file. Finally every unparsed token seen more than five
 * times is printed with its count, and the collected {@code Stats} are dumped.
 *
 * @param filename files whose lines are read with {@code readAll}
 * @throws IOException declared for the file reads
 */
public void noParse(String... filename) throws IOException {
  Histogram<String> unparsed = new Histogram<>(1000000);
  // The splitter configuration does not depend on the file, so build it once.
  Splitter splitter = Splitter.on(" ").omitEmptyStrings().trimResults();
  int total = 0;

  for (String file : filename) {
    for (String line : readAll(file)) {
      for (String token : splitter.split(line)) {
        WordAnalysis analysis = parser.analyze(token);
        if (++total % 50000 == 0) {
          System.out.println("Processed: " + total);
        }
        if (analysis.analysisCount() == 0) {
          unparsed.add(token);
        }
      }
    }
    System.out.println("Total: " + total);
  }

  Stats st = new Stats(0.0002);
  st.allCounts = (int) unparsed.totalCount();
  st.allUniques = unparsed.size();

  for (String token : unparsed.getSortedList()) {
    int occurrences = unparsed.getCount(token);
    if (occurrences <= 5) {
      continue;
    }
    st.significantCounts += occurrences;
    st.significantUniques++;
    System.out.println(token + " : " + occurrences);
  }
  st.dump();
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class DisambiguateSentences, method main.
/**
 * Demo entry point: analyzes a fixed Turkish sentence, logs every analysis of every word, then
 * disambiguates the sentence and logs the best analysis sequence.
 *
 * @param args not used
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  String sentence = "Bol baharatlı bir yemek yaptıralım.";
  Log.info("Sentence = " + sentence);

  List<WordAnalysis> analyses = morphology.analyzeSentence(sentence);
  Log.info("Sentence word analysis result:");
  analyses.forEach(wordAnalysis -> {
    Log.info("Word = " + wordAnalysis.getInput());
    // Each WordAnalysis iterates over its candidate analyses.
    wordAnalysis.forEach(candidate -> Log.info(candidate.formatLong()));
  });

  SentenceAnalysis result = morphology.disambiguate(sentence, analyses);
  Log.info("\nAfter ambiguity resolution : ");
  result.bestAnalysis().forEach(Log::info);
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class HunspellOperations, method generateAnnotationFileMultiSplit.
/**
 * Writes stem/ending split annotations for every analyzable word of a vocabulary file.
 *
 * <p>Each word is analyzed with a default {@code TurkishMorphology}; words whose analysis is not
 * correct are logged and skipped. For every analysis whose dictionary item is neither a proper
 * noun nor an abbreviation, the space separated surface sequence is cut at every position: the
 * tokens up to the cut are concatenated into a stem candidate and the remaining tokens, joined
 * with spaces, form the ending. A {@code "word stem ending"} entry is recorded when the stem
 * candidate passes {@code isCorrectAndContainsNoProper} and the ending is not empty. All distinct
 * entries of a word are written, comma separated and in first-seen order, as one output line.
 *
 * @param vocab UTF-8 file with one word per line
 * @param annotationsPath output file, written as UTF-8
 * @throws IOException if reading the vocabulary or writing the annotations fails
 */
private static void generateAnnotationFileMultiSplit(Path vocab, Path annotationsPath) throws IOException {
  List<String> words = Files.readAllLines(vocab, StandardCharsets.UTF_8);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> annotations = new ArrayList<>();

  for (String word : words) {
    WordAnalysis wordAnalysis = morphology.analyze(word);
    if (!analysis(wordAnalysis)) {
      Log.warn("Cannot analyze %s", word);
      continue;
    }

    LinkedHashSet<String> stemEndings = new LinkedHashSet<>();
    for (SingleAnalysis single : wordAnalysis) {
      SecondaryPos secondaryPos = single.getDictionaryItem().secondaryPos;
      boolean properOrAbbreviation =
          secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
      if (properOrAbbreviation) {
        continue;
      }

      String surfaces = AnalysisFormatters.SURFACE_SEQUENCE.format(single);
      List<String> tokens = Splitter.on(" ").splitToList(surfaces);
      int tokenCount = tokens.size();
      StringBuilder stemBuilder = new StringBuilder();
      for (int cut = 0; cut < tokenCount; cut++) {
        // The stem candidate is the concatenation of tokens[0..cut].
        stemBuilder.append(tokens.get(cut));
        String stem = stemBuilder.toString();
        // Everything after the cut is the ending; it is empty for the last position.
        String ending = String.join(" ", tokens.subList(cut + 1, tokenCount));
        // Candidates covering the whole word (empty ending) are not emitted.
        if (isCorrectAndContainsNoProper(morphology.analyze(stem)) && !ending.isEmpty()) {
          stemEndings.add(word + " " + stem + " " + ending);
        }
      }
    }
    annotations.add(String.join(",", stemEndings));
  }
  Files.write(annotationsPath, annotations, StandardCharsets.UTF_8);
}

/** Returns whether the given word analysis is reported as correct. */
private static boolean analysis(WordAnalysis wordAnalysis) {
  return wordAnalysis.isCorrect();
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
From the class StemDisambiguationExperiment, method unambiguous.
/**
 * Tells whether every token of the sentence resolves to at most one normalized lemma.
 *
 * @param sentence text tokenized with {@code TurkishTokenizer.DEFAULT}
 * @return {@code false} as soon as a token whose analyses carry more than one distinct normalized
 *     lemma is found, {@code true} otherwise
 */
private boolean unambiguous(String sentence) {
  for (String word : TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence)) {
    // Collect the distinct lemmas proposed by all analyses of this token.
    Set<String> distinctLemmas = new HashSet<>();
    for (SingleAnalysis candidate : morphology.analyze(word)) {
      distinctLemmas.add(candidate.getDictionaryItem().normalizedLemma());
    }
    boolean atMostOneLemma = distinctLemmas.size() <= 1;
    if (!atMostOneLemma) {
      return false;
    }
  }
  return true;
}
Aggregations