use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method parseLargeVocabularyZemberekForMorfessor.
/**
 * Script-style "test" that writes a Morfessor annotation file built from a word-frequency
 * vocabulary.
 *
 * <p>Reads {@code vocab.all.freq} under {@code DATA_PATH} into a histogram, calls
 * {@code removeSmaller(1000)} on it (presumably dropping rare words - confirm against
 * {@code Histogram}), analyzes every remaining word of at least four characters and, for words
 * whose first analysis is not {@code PrimaryPos.Unknown}, emits one line of the form
 * {@code word seg1, seg2, ...} where each segmentation is the root followed by the suffix
 * surfaces. The collected lines are handed to {@code sortAndSave} with the target
 * {@code out/morfessor-annotation.txt}.
 *
 * @throws IOException declared for the file-system and loading calls made here
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberekForMorfessor() throws IOException {
  Path wordFreqFile = DATA_PATH.resolve("vocab.all.freq");
  Path outDir = DATA_PATH.resolve("out");
  Files.createDirectories(outDir);
  TurkishMorphology parser = TurkishMorphology.createWithDefaults();
  Log.info("Loading histogram.");
  Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
  histogram.removeSmaller(1000);
  List<String> accepted = new ArrayList<>(histogram.size());
  // Counts only the words that reach analysis; words skipped for being too short are excluded.
  int c = 0;
  for (String s : histogram) {
    s = s.trim();
    if (s.length() < 4) {
      continue;
    }
    List<WordAnalysis> parses = parser.analyze(s);
    if (!parses.isEmpty() && parses.get(0).dictionaryItem.primaryPos != PrimaryPos.Unknown) {
      // Insertion-ordered set: identical segmentations from different parses are written once.
      LinkedHashSet<String> k = new LinkedHashSet<>(2);
      for (WordAnalysis parse : parses) {
        if (parse.dictionaryItem.lemma.length() > 1) {
          // Collapse runs of spaces between the joined suffix surfaces.
          String suffixes =
              String.join(" ", parse.suffixSurfaceList()).replaceAll("[ ]+", " ").trim();
          // Trim the whole segmentation, not only the suffix part: when no suffix is visible
          // the separator would otherwise survive as a trailing space and show up as
          // "root , ..." once several segmentations are joined.
          String str = (parse.root + " " + suffixes).trim();
          k.add(str);
        }
      }
      String join = String.join(", ", k).trim();
      // Skip words whose only segmentation is the word itself, and near-empty results.
      if (!s.equals(join) && join.length() > 2) {
        accepted.add(s + " " + join);
      }
    }
    if (c > 0 && c % 10000 == 0) {
      Log.info("Processed = " + c);
    }
    c++;
  }
  sortAndSave(outDir.resolve("morfessor-annotation.txt"), accepted);
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method generatorTest.
/**
 * Scratch method (ignored as a test): analyzes the word "besiciliği" with a default
 * {@code TurkishMorphology} and logs the inflectional groups of the first analysis.
 *
 * @throws IOException declared for the morphology construction call
 */
@Test
@Ignore("Not a Test.")
public void generatorTest() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<WordAnalysis> analyses = morphology.analyze("besiciliği");
  // Only the first analysis is inspected.
  Log.info(analyses.get(0).inflectionalGroups);
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class AutomaticLabelingExperiment method processContent.
/**
 * Normalizes a document for labeling: tokenizes it, drops numeric, punctuation, time and
 * unknown tokens, optionally replaces each remaining word with a lemma, then strips
 * apostrophes and lower-cases the text with the Turkish locale.
 *
 * @param analyzer sentence analyzer used for analysis and disambiguation; only used when
 *     {@code useRoots} is true
 * @param content raw text to process
 * @param useRoots when true, each word is replaced by the last lemma of its first parse after
 *     disambiguation; words whose first parse is unknown are kept as their input form
 * @return the space-joined, apostrophe-free, lower-cased text
 */
public String processContent(TurkishSentenceAnalyzer analyzer, String content, boolean useRoots) {
  List<Token> tokens = lexer.tokenize(content);
  List<String> kept = new ArrayList<>(tokens.size());
  for (Token token : tokens) {
    boolean dropped =
        token.getType() == TurkishLexer.PercentNumeral
            || token.getType() == TurkishLexer.Number
            || token.getType() == TurkishLexer.Punctuation
            || token.getType() == TurkishLexer.RomanNumeral
            || token.getType() == TurkishLexer.Time
            || token.getType() == TurkishLexer.UnknownWord
            || token.getType() == TurkishLexer.Unknown;
    if (!dropped) {
      kept.add(token.getText());
    }
  }

  String text = String.join(" ", kept);
  if (useRoots) {
    SentenceAnalysis analysis = analyzer.analyze(text);
    analyzer.disambiguate(analysis);
    List<String> roots = new ArrayList<>();
    for (SentenceAnalysis.Entry entry : analysis) {
      // After disambiguation the first parse is the one used.
      WordAnalysis best = entry.parses.get(0);
      if (best.isUnknown()) {
        roots.add(entry.input);
      } else {
        List<String> lemmas = best.getLemmas();
        // A word without any lemma contributes nothing to the output.
        if (!lemmas.isEmpty()) {
          roots.add(lemmas.get(lemmas.size() - 1));
        }
      }
    }
    text = String.join(" ", roots);
  }
  return text.replaceAll("[']", "").toLowerCase(Turkish.LOCALE);
}
use of zemberek.morphology.analysis.WordAnalysis in project lucene-solr-analysis-turkish by iorixxx.
the class Zemberek3StemFilterFactory method parse.
/**
 * Debug helper: analyzes a word and prints every analysis to standard output, followed by the
 * stem chosen by {@code Zemberek3StemFilter.stem} with the "maxLength" strategy.
 *
 * @param word the word to analyze
 * @param morphology the morphology instance used for the analysis
 */
private static void parse(String word, TurkishMorphology morphology) {
  List<WordAnalysis> results = morphology.analyze(word);
  System.out.println("Word = " + word + " has " + results.size() + " many solutions");
  // Nothing more to report when the word has no analysis.
  if (results.isEmpty()) {
    return;
  }

  System.out.println("Parses: ");
  for (WordAnalysis analysis : results) {
    System.out.println("number of morphemes = " + analysis.inflectionalGroups.size());
    System.out.println(analysis.formatLong());
    System.out.println("\tStems = " + analysis.getStems());
    System.out.println("\tLemmas = " + analysis.getLemmas());
    System.out.println("\tLemma = " + analysis.getLemma());
    System.out.println("\tRoot = " + analysis.getRoot());
    // Same label as above, but this value comes from the dictionary item.
    System.out.println("\tRoot = " + analysis.dictionaryItem.root);
    System.out.println("\tStemAndEnding = " + analysis.getStemAndEnding());
    System.out.println("-------------------");
  }

  String selected = Zemberek3StemFilter.stem(results, "maxLength");
  System.out.println("final selected stem : " + selected);
  System.out.println("==================================");
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class MorphologyConsole method run.
/**
 * Interactive console: reads lines from standard input, analyzes and disambiguates each one,
 * and prints every analysis of every word, marking the selected analysis with {@code *} when a
 * word has more than one.
 *
 * <p>The loop ends when the user types {@code quit} or when standard input reaches end of
 * input (for example piped input or Ctrl+D), instead of failing with a
 * {@code NoSuchElementException} from {@code Scanner.nextLine()}.
 */
@Override
public void run() {
  Builder b = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault());
  if (disableUnknownAnalysis) {
    b.disableUnidentifiedTokenAnalyzer();
  }
  if (enableInformalWordAnalysis) {
    b.useInformalAnalysis();
  }
  TurkishMorphology morphology = b.build();
  System.out.println("Enter word or sentence. Type `quit` or `Ctrl+C` to exit.:");
  // Deliberately not closed: closing the Scanner would also close System.in for the process.
  Scanner sc = new Scanner(System.in);
  while (sc.hasNextLine()) {
    String input = sc.nextLine();
    if (input.equals("quit")) {
      break;
    }
    if (input.trim().isEmpty()) {
      System.out.println("Empty line cannot be processed.");
      continue;
    }
    SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(input);
    System.out.format("%nS:%s%n", input);
    for (SentenceWordAnalysis sw : analysis) {
      WordAnalysis wa = sw.getWordAnalysis();
      System.out.println(wa.getInput());
      SingleAnalysis best = sw.getBestAnalysis();
      // With a single analysis there is nothing to disambiguate, so no marker is printed.
      boolean single = wa.analysisCount() == 1;
      for (SingleAnalysis singleAnalysis : wa) {
        if (single) {
          System.out.println(singleAnalysis.formatLong());
        } else {
          boolean isBest = singleAnalysis.equals(best);
          System.out.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
        }
      }
    }
    System.out.println();
  }
}
Aggregations