Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method parseLargeVocabularyZemberek.
/**
 * Script (not a real test): filters a word-frequency list down to the words that the default
 * morphology can analyze.
 *
 * <p>Loads a space-delimited histogram from {@code all-counts-sorted-freq.txt} under
 * {@code DATA_PATH}, analyzes every word, and accepts a word when it has at least one analysis
 * and the dictionary item of its first analysis has a primary POS other than
 * {@code PrimaryPos.Unknown}. The accepted words are passed to {@code save} and
 * {@code sortAndSave} with two target files in the {@code out} directory.
 *
 * @throws IOException propagated from the directory creation, histogram loading and saving
 *     calls made here
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberek() throws IOException {
  // Alternative input file: DATA_PATH.resolve("vocab.all.freq")
  Path wordFreqFile = DATA_PATH.resolve("all-counts-sorted-freq.txt");
  Path outDir = DATA_PATH.resolve("out");
  Files.createDirectories(outDir);
  TurkishMorphology parser = TurkishMorphology.createWithDefaults();
  Log.info("Loading histogram.");
  Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
  List<String> accepted = new ArrayList<>(histogram.size() / 3);
  int processed = 0;
  for (String word : histogram) {
    try {
      List<SingleAnalysis> analyses = parser.analyze(word).getAnalysisResults();
      if (!analyses.isEmpty()
          && analyses.get(0).getDictionaryItem().primaryPos != PrimaryPos.Unknown) {
        accepted.add(word);
      }
    } catch (Exception e) {
      // Deliberately best-effort: one bad word must not abort the whole run, but the cause is
      // logged so the failure can be diagnosed afterwards.
      Log.info("Exception in %s: %s", word, e);
    }
    // Count outside the try block so that words which failed are still counted, and report
    // after incrementing so the logged number equals the number of words handled so far.
    processed++;
    if (processed % 10000 == 0) {
      Log.info("Processed = " + processed);
    }
  }
  save(outDir.resolve("zemberek-parsed-words.txt"), accepted);
  sortAndSave(outDir.resolve("zemberek-parsed-words.tr.txt"), accepted);
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method generateOnlyOflazerWithAnalyzer.
/**
 * Script (not a real test): reads {@code only-oflazer-2.txt} from the {@code out} directory and
 * writes to {@code only-oflazer-3.txt} every line for which the default morphology's analysis
 * reports {@code isCorrect() == false}.
 *
 * @throws IOException if reading the input or writing the output fails
 */
@Test
@Ignore("Not a Test.")
public void generateOnlyOflazerWithAnalyzer() throws IOException {
  Path workDir = DATA_PATH.resolve("out");
  Path source = workDir.resolve("only-oflazer-2.txt");
  Path target = workDir.resolve("only-oflazer-3.txt");

  List<String> candidates = Files.readAllLines(source);
  Log.info("Oflazer Loaded. %d words.", candidates.size());

  List<String> unanalyzed = new ArrayList<>(candidates.size() / 10);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  for (int index = 0; index < candidates.size(); index++) {
    String word = candidates.get(index);
    WordAnalysis analysis = morphology.analyze(word);
    if (!analysis.isCorrect()) {
      unanalyzed.add(word);
    }
    // Progress is reported on the 1st word and then on every 20000th word after it;
    // the logged value is the number of words handled so far.
    if (index % 20000 == 0) {
      Log.info("%d processed.", index + 1);
    }
  }

  Log.info("Writing.");
  Files.write(target, unanalyzed);
  Log.info("Oflazer-only saved.");
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method extractCircumflexWords.
/**
 * Script (not a real test): collects the lemma of every lexicon item for which
 * {@code TurkishAlphabet.INSTANCE.containsCircumflex} returns true and writes the distinct
 * lemmas, one per line, to {@code words-with-circumflex.txt} in the {@code out} directory.
 *
 * @throws IOException if the output directory cannot be created or the file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void extractCircumflexWords() throws IOException {
  Path outDir = DATA_PATH.resolve("out");
  // The output directory may not exist yet when this script is the first one to be run.
  Files.createDirectories(outDir);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  // Insertion-ordered set: repeated lemmas are written once, in lexicon iteration order.
  LinkedHashSet<String> result = new LinkedHashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    if (TurkishAlphabet.INSTANCE.containsCircumflex(item.lemma)) {
      result.add(item.lemma);
    }
  }
  Log.info("Writing.");
  Files.write(outDir.resolve("words-with-circumflex.txt"), result);
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method createZemberekVocabulary.
/**
 * Script (not a real test): dumps the lemma of every item in the default lexicon, ordered by
 * {@code turkishCollator}, to {@code zemberek.vocab} in the {@code out} directory.
 *
 * @throws IOException if the output directory cannot be created or the file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void createZemberekVocabulary() throws IOException {
  Path targetDir = DATA_PATH.resolve("out");
  Files.createDirectories(targetDir);

  RootLexicon lexicon = TurkishMorphology.createWithDefaults().getLexicon();
  // Pre-sized to the lexicon size; one lemma is added per lexicon item (duplicates are kept).
  List<String> lemmas = new ArrayList<>(lexicon.size());
  lexicon.forEach(entry -> lemmas.add(entry.lemma));
  lemmas.sort(turkishCollator::compare);

  Path vocabFile = targetDir.resolve("zemberek.vocab");
  Files.write(vocabFile, lemmas);
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method dictionaryObsoleteCircumflexWordsCheck.
/**
 * Script (not a real test): for each line of {@code words-with-circumflex-obsolete.txt}, looks
 * up the lexicon items matching the word itself and the word after
 * {@code TurkishAlphabet.INSTANCE.normalizeCircumflex}. When the two lookups together yield
 * exactly one item, that item's string form is stripped of a fixed set of attribute markers,
 * whitespace-normalized and collected; the collected lines are written to
 * {@code words-with-circumflex-obsolete-single.txt}.
 *
 * @throws IOException if reading the input or writing the output fails
 */
@Test
@Ignore("Not a Test.")
public void dictionaryObsoleteCircumflexWordsCheck() throws IOException {
  Path path = Paths.get("../data/vocabulary/words-with-circumflex-obsolete.txt");
  List<String> obsolete = Files.readAllLines(path, StandardCharsets.UTF_8);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  RootLexicon lexicon = morphology.getLexicon();

  // Removed in this exact order, trimming after each removal (the order and the intermediate
  // trims affect the result because some markers end with a space).
  String[] markers = {
      "[P:Noun]", "[P:Noun, Prop]", "P:Noun; ", "P:Noun, Prop; ", "P:Verb; ", "[A:Voicing]"
  };

  List<String> single = new ArrayList<>();
  for (String s : obsolete) {
    // Merge into a private copy: the list handed out by the lexicon may be immutable or may be
    // backed by lexicon-internal state, so it must not be modified in place.
    List<DictionaryItem> items = new ArrayList<>(lexicon.getMatchingItems(s));
    items.addAll(lexicon.getMatchingItems(TurkishAlphabet.INSTANCE.normalizeCircumflex(s)));
    Log.info("%s = %s", s, items);
    if (items.size() == 1) {
      String line = items.get(0).toString();
      for (String marker : markers) {
        line = line.replace(marker, "").trim();
      }
      single.add(line.replaceAll("\\s+", " ").trim());
    }
  }

  Path pathSingle = Paths.get("../data/vocabulary/words-with-circumflex-obsolete-single.txt");
  Files.write(pathSingle, single, StandardCharsets.UTF_8);
}
Aggregations