Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class TurkishMorphologyFunctionalTests, method testForeingLocale.
/**
 * Checks that the word "Unicefte" yields exactly one analysis when the lexicon entry
 * "UNICEF" carries the {@code [A:LocaleEn]} attribute, and none when the attribute is absent.
 */
@Test
public void testForeingLocale() {
  // Entry declared with the LocaleEn attribute: one analysis is expected.
  WordAnalysis withLocaleAttribute =
      getMorphology("UNICEF [A:LocaleEn]").analyze("Unicefte");
  Assert.assertEquals(1, withLocaleAttribute.analysisCount());

  // Same entry without the attribute: no analysis is expected.
  WordAnalysis withoutLocaleAttribute =
      getMorphology("UNICEF").analyze("Unicefte");
  Assert.assertEquals(0, withoutLocaleAttribute.analysisCount());
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method performance.
/**
 * Manual throughput measurement (ignored as a test). Runs three timed passes over the lines
 * of a local corpus file and logs tokens/second for each:
 * <ol>
 *   <li>tokenization only,</li>
 *   <li>sentence analysis via {@code analyzeSentence},</li>
 *   <li>analysis plus disambiguation via {@code analyzeAndDisambiguate}.</li>
 * </ol>
 * Speeds for passes 2 and 3 are computed from the token counts gathered in pass 1.
 *
 * @throws IOException if the corpus file cannot be read
 */
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
  // Alternative corpus location used previously: /media/depo/data/aaa/corpora/dunya.100k
  List<String> sentences =
      Files.readAllLines(Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));

  TurkishMorphology morphology = TurkishMorphology.builder()
      .setLexicon(RootLexicon.getDefault())
      .disableUnidentifiedTokenAnalyzer()
      .disableCache()
      .build();

  Log.info(sentences.size() + " lines will be processed.");
  Log.info("Dictionary has " + morphology.getLexicon().size() + " items.");

  // ---- Pass 1: tokenization only. ----
  long allTokens = 0;
  long wordTokens = 0;
  Stopwatch stopwatch = Stopwatch.createStarted();
  TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
  for (String sentence : sentences) {
    List<Token> tokenList = tokenizer.tokenize(sentence);
    // Everything except whitespace tokens.
    allTokens += tokenList.stream()
        .filter(t -> t.getType() != Token.Type.SpaceTab)
        .count();
    // Everything except whitespace and punctuation tokens.
    wordTokens += tokenList.stream()
        .filter(t -> !(t.getType() == Token.Type.Punctuation
            || t.getType() == Token.Type.SpaceTab))
        .count();
  }
  long millis = stopwatch.elapsed(TimeUnit.MILLISECONDS);
  double speedAll = allTokens * 1000d / millis;
  double speedWords = wordTokens * 1000d / millis;
  Log.info("Elapsed Time = " + millis);
  Log.info("Token Count = " + allTokens);
  Log.info("Token Count (No Punctuation) = " + wordTokens);
  Log.info("Tokenization Speed = %.1f tokens/sec", speedAll);
  Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ", speedWords);
  Log.info("");

  // ---- Pass 2: sentence analysis. ----
  Log.info("Sentence word analysis test:");
  // Accumulates result sizes so the work cannot be optimized away by the VM.
  int sink = 0;
  stopwatch.reset().start();
  for (String sentence : sentences) {
    try {
      List<WordAnalysis> analyses = morphology.analyzeSentence(sentence);
      sink += analyses.size();
    } catch (Exception e) {
      // Report the offending line and keep going.
      Log.info(sentence);
      e.printStackTrace();
    }
  }
  millis = stopwatch.elapsed(TimeUnit.MILLISECONDS);
  speedAll = allTokens * 1000d / millis;
  speedWords = wordTokens * 1000d / millis;
  Log.info("Elapsed Time = " + millis);
  Log.info("Tokenization + Analysis speed = %.1f tokens/sec", speedAll);
  Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec", speedWords);
  Log.info(morphology.toString());
  Log.info("");

  // ---- Pass 3: analysis + disambiguation. ----
  Log.info("Disambiguation Test:");
  morphology.invalidateCache();
  stopwatch.reset().start();
  for (String sentence : sentences) {
    try {
      SentenceAnalysis disambiguated = morphology.analyzeAndDisambiguate(sentence);
      sink += disambiguated.size();
    } catch (Exception e) {
      // Report the offending line and keep going.
      Log.info(sentence);
      e.printStackTrace();
    }
  }
  millis = stopwatch.elapsed(TimeUnit.MILLISECONDS);
  speedAll = allTokens * 1000d / millis;
  speedWords = wordTokens * 1000d / millis;
  Log.info("Elapsed Time = " + millis);
  Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec", speedAll);
  Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec", speedWords);
  Log.info(sink);
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method parseLargeVocabularyZemberek.
/**
 * Script (ignored as a test). Loads a word histogram from
 * {@code all-counts-sorted-freq.txt} under {@code DATA_PATH}, analyzes every word with the
 * default morphology, and collects the words whose first analysis has a dictionary item with
 * a primary POS other than {@code Unknown}. The accepted words are then written to two files
 * under {@code out/} via the {@code save} and {@code sortAndSave} helpers.
 *
 * @throws IOException if the output directory cannot be created or a file operation fails
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberek() throws IOException {
  // Previously used input: DATA_PATH.resolve("vocab.all.freq")
  Path frequencyFile = DATA_PATH.resolve("all-counts-sorted-freq.txt");
  Path outputDirectory = DATA_PATH.resolve("out");
  Files.createDirectories(outputDirectory);

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  Log.info("Loading histogram.");
  Histogram<String> wordCounts = Histogram.loadFromUtf8File(frequencyFile, ' ');
  List<String> accepted = new ArrayList<>(wordCounts.size() / 3);

  // Counts only the words that were processed without an exception.
  int processed = 0;
  for (String word : wordCounts) {
    try {
      List<SingleAnalysis> results = morphology.analyze(word).getAnalysisResults();
      boolean recognized = !results.isEmpty()
          && results.get(0).getDictionaryItem().primaryPos != PrimaryPos.Unknown;
      if (recognized) {
        accepted.add(word);
      }
      // Progress report every 10000 successfully processed words.
      if (processed > 0 && processed % 10000 == 0) {
        Log.info("Processed = " + processed);
      }
      processed++;
    } catch (Exception e) {
      // Best effort: note the failing word and continue with the next one.
      Log.info("Exception in %s", word);
    }
  }

  save(outputDirectory.resolve("zemberek-parsed-words.txt"), accepted);
  sortAndSave(outputDirectory.resolve("zemberek-parsed-words.tr.txt"), accepted);
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method guessRootsWithHeuristics.
/**
 * Script (ignored as a test). For each word read from
 * {@code out/no-parse-zemberek-freq.txt}, tries every prefix of length three or more as a
 * candidate root. Each candidate is temporarily registered as a dictionary item (as a default
 * item, as a verb, and, when the next letter is a vowel, in a devoiced variant), the word is
 * analyzed, and the candidate is recorded if any analysis is not unknown.
 *
 * <p>Roots that explain at least two words are written to
 * {@code out/root-candidates-words} (root plus its words, sorted with
 * {@code turkishCollator}) and {@code out/root-candidates-vocabulary} (root only).
 * Both files are written as UTF-8, matching how the input is read.
 *
 * @throws IOException if the input cannot be read or the outputs cannot be written
 */
@Test
@Ignore("Not a Test.")
public void guessRootsWithHeuristics() throws IOException {
  Path wordFreqFile = DATA_PATH.resolve("out/no-parse-zemberek-freq.txt");
  Log.info("Loading histogram.");
  List<String> words = Files.readAllLines(wordFreqFile);
  // Minimal lexicon; candidate roots are added and removed on the fly below.
  TurkishMorphology morphology =
      TurkishMorphology.builder().setLexicon("elma").disableCache().build();
  Multimap<String, String> res = HashMultimap.create(100000, 3);
  int c = 0;
  for (String s : words) {
    if (s.length() < 4) {
      continue;
    }
    if (!TurkishAlphabet.INSTANCE.containsVowel(s)) {
      continue;
    }
    for (int i = 2; i < s.length(); i++) {
      String candidateRoot = s.substring(0, i + 1);
      if (!TurkishAlphabet.INSTANCE.containsVowel(candidateRoot)) {
        continue;
      }
      List<DictionaryItem> items = new ArrayList<>(3);
      // No POS given: loader default (presumably noun).
      items.add(TurkishDictionaryLoader.loadFromString(candidateRoot));
      // Assumes verb.
      items.add(TurkishDictionaryLoader.loadFromString(candidateRoot + " [P:Verb]"));
      char last = candidateRoot.charAt(candidateRoot.length() - 1);
      if (i < s.length() - 1) {
        char next = s.charAt(candidateRoot.length());
        if (Turkish.Alphabet.isVowel(next)) {
          // A vowel follows the candidate: also try the root with its final
          // b/c/ğ/d replaced by p/ç/k/t respectively.
          String stemWithoutLast = candidateRoot.substring(0, candidateRoot.length() - 1);
          String f = "";
          if (last == 'b') {
            f = stemWithoutLast + 'p';
          } else if (last == 'c') {
            f = stemWithoutLast + 'ç';
          } else if (last == 'ğ') {
            f = stemWithoutLast + 'k';
          } else if (last == 'd') {
            f = stemWithoutLast + 't';
          }
          if (f.length() > 0) {
            items.add(TurkishDictionaryLoader.loadFromString(f));
          }
        }
      }
      for (DictionaryItem item : items) {
        morphology.getMorphotactics().getStemTransitions().addDictionaryItem(item);
        try {
          WordAnalysis analyze = morphology.analyze(s);
          for (SingleAnalysis wordAnalysis : analyze) {
            if (!wordAnalysis.isUnknown()) {
              res.put(candidateRoot, s);
            }
          }
        } finally {
          // Always undo the temporary registration, even if analysis throws, so one
          // failing word does not leave a stray item behind for later candidates.
          morphology.getMorphotactics().getStemTransitions().removeDictionaryItem(item);
        }
      }
    }
    if (++c % 10000 == 0) {
      Log.info(c);
    }
    // Hard cap on the number of words examined.
    if (c == 100000) {
      break;
    }
  }
  Log.info("Writing.");
  // Explicit UTF-8: the words contain Turkish letters and the input is read as UTF-8.
  try (PrintWriter pw1 = new PrintWriter(
           DATA_PATH.resolve("out/root-candidates-words").toFile(), "UTF-8");
       PrintWriter pw2 = new PrintWriter(
           DATA_PATH.resolve("out/root-candidates-vocabulary").toFile(), "UTF-8")) {
    for (String root : res.keySet()) {
      Collection<String> vals = res.get(root);
      // A root supported by a single word is not considered a useful candidate.
      if (vals.size() < 2) {
        continue;
      }
      List<String> wl = new ArrayList<>(vals);
      wl.sort(turkishCollator::compare);
      // Print the sorted copy (the original printed the unsorted collection).
      pw1.println(root + " : " + String.join(", ", wl));
      pw2.println(root);
    }
  }
}
Use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
Class LoadProperNouns, method main.
/**
 * Builds a frequency list of capitalized words that the default morphology does not already
 * know as (non-runtime) proper nouns.
 *
 * <p>Each non-empty input line that does not start with {@code "_"} is split at the first
 * space into a word and an integer count. The word's first character is dropped, the rest is
 * capitalized, and the word is skipped when its count is below 50, it is listed in the
 * {@code proper-ignore} resource, it is shorter than four characters, or an analysis already
 * marks it as a proper noun from a non-runtime dictionary item. Remaining words are
 * accumulated in a histogram which is filtered with {@code removeSmaller(165)} and written,
 * sorted, to the file {@code proper} as UTF-8.
 *
 * @param args unused
 * @throws IOException if an input file cannot be read or the output cannot be written
 */
public static void main(String[] args) throws IOException {
  TurkishMorphology parserGenerator = TurkishMorphology.createWithDefaults();
  List<String> lines = Files.readAllLines(Paths.get("/home/afsina/Downloads/documents-export-2016-02-17/vocabulary-proper-full.tr.txt"));
  Histogram<String> histogram = new Histogram<>();
  Set<String> ignore = new HashSet<>(Files.readAllLines(Paths.get("morphology/src/main/resources/tr/proper-ignore")));
  for (String line : lines) {
    if (line.startsWith("_")) {
      continue;
    }
    line = line.trim();
    if (line.length() == 0) {
      continue;
    }
    String word = Strings.subStringUntilFirst(line, " ");
    int count = Integer.parseInt(Strings.subStringAfterFirst(line, " "));
    // Drops the first character before capitalizing.
    // NOTE(review): presumably a marker prefix in the input format - confirm.
    word = Turkish.capitalize(word.substring(1));
    if (count < 50) {
      continue;
    }
    if (ignore.contains(word)) {
      continue;
    }
    // Cheap length filter first, so short words never reach the analyzer.
    if (word.length() < 4) {
      continue;
    }
    WordAnalysis parses = parserGenerator.analyze(word);
    boolean found = false;
    for (SingleAnalysis parse : parses) {
      if (parse.getDictionaryItem().secondaryPos.equals(SecondaryPos.ProperNoun) && !parse.getDictionaryItem().hasAttribute(RootAttribute.Runtime)) {
        found = true;
        // One matching analysis is enough.
        break;
      }
    }
    parserGenerator.invalidateCache();
    if (found) {
      continue;
    }
    histogram.add(word, count);
  }
  histogram.removeSmaller(165);
  // Explicit UTF-8: the words contain Turkish letters and the input is read as UTF-8.
  try (PrintWriter pw = new PrintWriter("proper", "UTF-8")) {
    histogram.getSortedList(Turkish.STRING_COMPARATOR_ASC).forEach(pw::println);
  }
}
Aggregations