use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class CoverageTest method checkCoverage.
/**
 * Runs the rule based analyzer over every word in {@code lines} and records the words for which
 * no analysis is produced.
 *
 * <p>The words are removed from {@code lines} (the deque is empty on return) and processed in
 * batches of 20,000 on a fixed thread pool. Failed words are sorted and written to
 * {@code ../data/zemberek-oflazer/new-analyzer-failed.txt}. The "passed" file is written as well,
 * but it stays empty because collecting passed words is currently disabled.
 *
 * @param lines words to analyze; drained by this method.
 * @throws IOException if the dictionaries cannot be loaded or the result files cannot be written.
 * @throws InterruptedException if the calling thread is interrupted while waiting for a batch.
 * @throws java.util.concurrent.ExecutionException if a batch task fails.
 */
private void checkCoverage(ArrayDeque<String> lines) throws IOException, InterruptedException, java.util.concurrent.ExecutionException {
  RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
  TurkishMorphotactics morphotactics = new TurkishMorphotactics(lexicon);
  RuleBasedAnalyzer analyzer = RuleBasedAnalyzer.instance(morphotactics);

  // availableProcessors() / 2 evaluates to 0 on a single core machine and a fixed thread pool
  // cannot be created with 0 threads, so always use at least one worker.
  int threadCount = Math.max(1, Runtime.getRuntime().availableProcessors() / 2);
  Log.info("Thread count = %d", threadCount);

  Result allResult = new Result(new ArrayList<>(100000), new ArrayList<>(1000000), lines.size());
  Stopwatch sw = Stopwatch.createStarted();
  int batchSize = 20_000;
  int total = 0;

  ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
  try {
    CompletionService<Result> service = new ExecutorCompletionService<>(executorService);

    // Drain the input deque into batches and submit each batch as one task.
    int batchCount = 0;
    while (!lines.isEmpty()) {
      List<String> batch = new ArrayList<>(batchSize);
      while (batch.size() < batchSize && !lines.isEmpty()) {
        batch.add(lines.poll());
      }
      service.submit(() -> {
        List<String> failed = new ArrayList<>(batchSize / 2);
        // Collecting passed words is disabled; the list is intentionally left empty.
        List<String> passed = new ArrayList<>(batchSize);
        for (String s : batch) {
          // Analyze the lower cased word with apostrophes removed, but report the original form.
          String c = s.toLowerCase(Turkish.LOCALE).replace("'", "");
          List<SingleAnalysis> results = analyzer.analyze(c);
          if (results.isEmpty()) {
            failed.add(s);
          }
        }
        return new Result(failed, passed, batch.size());
      });
      batchCount++;
    }

    // Collect the batch results in completion order.
    for (int i = 0; i < batchCount; i++) {
      Result r = service.take().get();
      allResult.failedWords.addAll(r.failedWords);
      allResult.passedWords.addAll(r.passedWords);
      total += r.wordCount;
      // Progress report after every ten full batches.
      if (total % (batchSize * 10) == 0) {
        logResult(allResult.failedWords, total, sw);
      }
    }
  } finally {
    // Release the worker threads, also when a batch fails or the wait is interrupted.
    executorService.shutdownNow();
  }

  logResult(allResult.failedWords, total, sw);
  allResult.failedWords.sort(Turkish.STRING_COMPARATOR_ASC);
  allResult.passedWords.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-failed.txt"), allResult.failedWords, StandardCharsets.UTF_8);
  Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-passed.txt"), allResult.passedWords, StandardCharsets.UTF_8);
}
use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class TurkishStopWords method generateFromDictionary.
/**
 * Builds a stop word set from the default lexicon.
 *
 * <p>The lemma of every dictionary item whose primary part of speech is one of adverb,
 * conjunction, determiner, interjection, post positive, numeral, pronoun or question is
 * collected. Duplicates are removed and the result keeps the order defined by
 * {@code Turkish.STRING_COMPARATOR_ASC}.
 *
 * @return stop words generated from the dictionary.
 * @throws IOException declared by the signature; propagated from the calls made here.
 */
static TurkishStopWords generateFromDictionary() throws IOException {
  Set<PrimaryPos> acceptedPos = Sets.newHashSet(
      PrimaryPos.Adverb,
      PrimaryPos.Conjunction,
      PrimaryPos.Determiner,
      PrimaryPos.Interjection,
      PrimaryPos.PostPositive,
      PrimaryPos.Numeral,
      PrimaryPos.Pronoun,
      PrimaryPos.Question);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  RootLexicon lexicon = morphology.getLexicon();

  // A hash set removes lemmas that are shared by several dictionary items.
  Set<String> lemmas = new HashSet<>();
  for (DictionaryItem entry : lexicon) {
    if (!acceptedPos.contains(entry.primaryPos)) {
      continue;
    }
    lemmas.add(entry.lemma);
  }

  List<String> sortedLemmas = new ArrayList<>(lemmas);
  sortedLemmas.sort(Turkish.STRING_COMPARATOR_ASC);

  // LinkedHashSet preserves the sorted order.
  LinkedHashSet<String> ordered = new LinkedHashSet<>(sortedLemmas);
  return new TurkishStopWords(ordered);
}
use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class NormalizationVocabularyGenerator method getTurkishMorphology.
/**
 * Creates a {@code TurkishMorphology} instance from a fixed list of dictionary resources, with
 * the unidentified token analyzer disabled and a dynamic analysis cache configured.
 *
 * @param asciiTolerant if true, diacritics are ignored during analysis.
 * @return the configured morphology instance.
 * @throws IOException declared by the signature; propagated from the calls made here.
 */
static TurkishMorphology getTurkishMorphology(boolean asciiTolerant) throws IOException {
  AnalysisCache analysisCache = AnalysisCache.builder()
      .dynamicCacheSize(200_000, 400_000)
      .build();
  RootLexicon rootLexicon = TurkishDictionaryLoader.loadFromResources(
      "tr/master-dictionary.dict",
      "tr/non-tdk.dict",
      "tr/proper.dict",
      "tr/proper-from-corpus.dict",
      "tr/abbreviations.dict",
      "tr/person-names.dict");
  TurkishMorphology.Builder morphologyBuilder = TurkishMorphology.builder()
      .setLexicon(rootLexicon)
      .disableUnidentifiedTokenAnalyzer()
      .setCache(analysisCache);
  if (!asciiTolerant) {
    return morphologyBuilder.build();
  }
  morphologyBuilder.ignoreDiacriticsInAnalysis();
  return morphologyBuilder.build();
}
use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class TurkishDictionaryLoaderTest method saveFullAttributes.
/**
 * Writes the string form of every item in the default dictionaries to
 * {@code dictionary-all-attributes.txt}, one item per line, encoded as UTF-8.
 *
 * @throws IOException if the dictionaries cannot be loaded or the output file cannot be opened.
 */
@Test
@Ignore("Not a unit test")
public void saveFullAttributes() throws IOException {
  RootLexicon items = TurkishDictionaryLoader.loadDefaultDictionaries();
  // try-with-resources closes the writer, which flushes buffered output and releases the file.
  try (PrintWriter p = new PrintWriter(new File("dictionary-all-attributes.txt"), "utf-8")) {
    for (DictionaryItem item : items) {
      p.println(item.toString());
    }
  }
}
use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method dictionaryObsoleteCircumflexWordsCheck.
/**
 * Looks up each word of {@code words-with-circumflex-obsolete.txt} in the default lexicon, both
 * as written and after circumflex normalization. When the two lookups yield exactly one
 * dictionary item in total, a simplified form of that item's string is written to
 * {@code words-with-circumflex-obsolete-single.txt}.
 *
 * @throws IOException if the input file cannot be read or the output file cannot be written.
 */
@Test
@Ignore("Not a Test.")
public void dictionaryObsoleteCircumflexWordsCheck() throws IOException {
  Path path = Paths.get("../data/vocabulary/words-with-circumflex-obsolete.txt");
  List<String> obsolete = Files.readAllLines(path, StandardCharsets.UTF_8);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  RootLexicon lexicon = morphology.getLexicon();
  List<String> single = new ArrayList<>();
  for (String s : obsolete) {
    // Copy before merging: whether the list returned by getMatchingItems is mutable or shared
    // with the lexicon is not visible here, so it must not be modified in place.
    List<DictionaryItem> items = new ArrayList<>(lexicon.getMatchingItems(s));
    List<DictionaryItem> matchingItems = lexicon.getMatchingItems(TurkishAlphabet.INSTANCE.normalizeCircumflex(s));
    items.addAll(matchingItems);
    Log.info("%s = %s", s, items);
    if (items.size() == 1) {
      // Strip part of speech and voicing markers from the item's string form.
      String line = items.get(0).toString();
      line = line.replace("[P:Noun]", "").trim();
      line = line.replace("[P:Noun, Prop]", "").trim();
      line = line.replace("P:Noun; ", "").trim();
      line = line.replace("P:Noun, Prop; ", "").trim();
      line = line.replace("P:Verb; ", "").trim();
      line = line.replace("[A:Voicing]", "").trim();
      // Collapse the whitespace runs left behind by the removals.
      single.add(line.replaceAll("\\s+", " ").trim());
    }
  }
  Path pathSingle = Paths.get("../data/vocabulary/words-with-circumflex-obsolete-single.txt");
  Files.write(pathSingle, single, StandardCharsets.UTF_8);
}
Aggregations