Use of zemberek.morphology.lexicon.RootLexicon in project lucene-solr-analysis-turkish by iorixxx.
From class Zemberek3StemFilterFactory, method inform:
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (dictionaryFiles == null || dictionaryFiles.trim().isEmpty()) {
    // Use default dictionaries shipped with Zemberek3.
    this.parser = TurkishWordParserGenerator.createWithDefaults().getParser();
    return;
  }
  List<String> lines = new ArrayList<>();
  List<String> files = splitFileNames(dictionaryFiles);
  if (files.size() > 0) {
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      lines.addAll(wlist);
    }
  }
  if (lines.isEmpty()) {
    // Use default dictionaries shipped with Zemberek3.
    this.parser = TurkishWordParserGenerator.createWithDefaults().getParser();
    return;
  }
  SuffixProvider suffixProvider = new TurkishSuffixes();
  RootLexicon lexicon = new TurkishDictionaryLoader(suffixProvider).load(lines);
  DynamicLexiconGraph graph = new DynamicLexiconGraph(suffixProvider);
  graph.addDictionaryItems(lexicon);
  parser = new WordParser(graph);
}
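The tail of inform() shows the full custom-dictionary pipeline, which can also be driven from in-memory lines. A minimal sketch using only the classes seen above; the lemma strings are hypothetical placeholders and would need to follow Zemberek3's actual dictionary line format:

// Sketch: build a WordParser from in-memory dictionary lines, mirroring
// the tail of inform() above. The lemmas below are hypothetical examples.
List<String> lines = Arrays.asList("elma", "kitap");
SuffixProvider suffixProvider = new TurkishSuffixes();
RootLexicon lexicon = new TurkishDictionaryLoader(suffixProvider).load(lines);
DynamicLexiconGraph graph = new DynamicLexiconGraph(suffixProvider);
graph.addDictionaryItems(lexicon);
WordParser parser = new WordParser(graph);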
Use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
From class _MorphologicalAmbiguityResolverExperiment, method collect:
private List<SingleAnalysisSentence> collect(Path p, int maxAnalysisCount) throws IOException {
  List<String> sentences = getSentences(p);
  RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
  InterpretingAnalyzer analyzer = new InterpretingAnalyzer(lexicon);
  int tokenCount = 0;
  int sentenceCount = 0;
  List<SingleAnalysisSentence> result = new ArrayList<>();
  for (String sentence : sentences) {
    // Normalize whitespace and non-breaking spaces, strip soft hyphens,
    // and expand ellipsis characters.
    sentence = sentence.replaceAll("\\s+|\\u00a0", " ");
    sentence = sentence.replaceAll("[\\u00ad]", "");
    sentence = sentence.replaceAll("[…]", "...");
    List<Single> singleAnalysisWords = new ArrayList<>();
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    boolean failed = false;
    int i = 0;
    for (Token token : tokens) {
      tokenCount++;
      String rawWord = token.getText();
      String word = Character.isUpperCase(rawWord.charAt(0))
          ? Turkish.capitalize(rawWord)
          : rawWord.toLowerCase(Turkish.LOCALE);
      // Memoize analyses per surface form.
      List<_SingleAnalysis> results;
      if (cache.containsKey(word)) {
        results = cache.get(word);
      } else {
        results = analyzer.analyze(word);
        cache.put(word, results);
      }
      if (results.size() == 0) {
        if (Strings.containsNone(word, "0123456789-.")) {
          failedWords.add(word);
        }
      }
      // Skip the sentence if a word has no analysis or too many.
      if (results.size() < 1 || results.size() > maxAnalysisCount) {
        failed = true;
        break;
      } else {
        // Discard proper-noun analyses for words starting with a lowercase letter.
        results = results.stream()
            .filter(s -> !(s.getItem().secondaryPos == SecondaryPos.ProperNoun
                && Character.isLowerCase(rawWord.charAt(0))))
            .collect(Collectors.toList());
        if (results.size() == 0) {
          failed = true;
          break;
        }
        singleAnalysisWords.add(new Single(word, i, results));
        i++;
      }
    }
    if (!failed) {
      result.add(new SingleAnalysisSentence(sentence, singleAnalysisWords));
    }
    sentenceCount++;
    if (sentenceCount % 2000 == 0) {
      Log.info("%d sentences %d tokens analyzed. %d found", sentenceCount, tokenCount, result.size());
    }
  }
  return result;
}
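The containsKey/get/put sequence above can be written more compactly with Map.computeIfAbsent; a sketch, assuming cache is a Map<String, List<_SingleAnalysis>> as its usage above implies:

// Compact equivalent of the cache lookup in collect(), assuming
// cache is a Map<String, List<_SingleAnalysis>>.
List<_SingleAnalysis> results = cache.computeIfAbsent(word, analyzer::analyze);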
Use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
From class DictionaryOperations, method saveProperNouns:
public static void saveProperNouns() throws IOException {
  // TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
  Set<String> set = new HashSet<>();
  for (DictionaryItem item : lexicon) {
    String lemma = item.lemma;
    if (item.attributes.contains(RootAttribute.Dummy)) {
      continue;
    }
    if (item.secondaryPos != SecondaryPos.ProperNoun) {
      continue;
    }
    set.add(lemma);
  }
  List<String> list = new ArrayList<>(set);
  list.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.proper.vocab"), list);
}
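The same filtering can be expressed as a stream pipeline; a sketch equivalent to the loop above, assuming RootLexicon is Iterable<DictionaryItem> as the for-each implies:

// Stream-based equivalent of the loop in saveProperNouns(): collect
// proper-noun lemmas, skipping Dummy entries, de-duplicated and sorted.
List<String> list = StreamSupport.stream(lexicon.spliterator(), false)
    .filter(item -> !item.attributes.contains(RootAttribute.Dummy))
    .filter(item -> item.secondaryPos == SecondaryPos.ProperNoun)
    .map(item -> item.lemma)
    .distinct()
    .sorted(Turkish.STRING_COMPARATOR_ASC)
    .collect(Collectors.toList());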
Use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
From class CoverageTest, method checkCoverage:
private void checkCoverage(ArrayDeque<String> lines)
    throws IOException, InterruptedException, java.util.concurrent.ExecutionException {
  RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
  InterpretingAnalyzer analyzer = new InterpretingAnalyzer(lexicon);
  int threadCount = Runtime.getRuntime().availableProcessors() / 2;
  Log.info("Thread count = %d", threadCount);
  ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
  CompletionService<Result> service = new ExecutorCompletionService<>(executorService);
  Result allResult = new Result(new ArrayList<>(100000), new ArrayList<>(1000000), lines.size());
  Stopwatch sw = Stopwatch.createStarted();
  int batchCount = 0;
  int batchSize = 20_000;
  // Split the input into batches and submit one analysis task per batch.
  while (!lines.isEmpty()) {
    List<String> batch = new ArrayList<>(batchSize);
    int j = 0;
    while (j < batchSize && !lines.isEmpty()) {
      batch.add(lines.poll());
      j++;
    }
    if (batch.size() > 0) {
      service.submit(() -> {
        List<String> failed = new ArrayList<>(batchSize / 2);
        List<String> passed = new ArrayList<>(batchSize);
        for (String s : batch) {
          String c = s.toLowerCase(Turkish.LOCALE).replaceAll("[']", "");
          List<_SingleAnalysis> results = analyzer.analyze(c);
          if (results.size() == 0) {
            failed.add(s);
          } else {
            // passed.add(s);
          }
        }
        return new Result(failed, passed, batch.size());
      });
      batchCount++;
    }
  }
  // No more batches will be submitted; let the worker threads drain and exit.
  executorService.shutdown();
  // Collect batch results as they complete.
  int i = 0;
  int total = 0;
  while (i < batchCount) {
    Result r = service.take().get();
    allResult.failedWords.addAll(r.failedWords);
    allResult.passedWords.addAll(r.passedWords);
    total += r.wordCount;
    if (total % (batchSize * 10) == 0) {
      logResult(allResult.failedWords, total, sw);
    }
    i++;
  }
  logResult(allResult.failedWords, total, sw);
  allResult.failedWords.sort(Turkish.STRING_COMPARATOR_ASC);
  allResult.passedWords.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-failed.txt"), allResult.failedWords, StandardCharsets.UTF_8);
  Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-passed.txt"), allResult.passedWords, StandardCharsets.UTF_8);
}
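The Result type is not shown in the snippet; a hypothetical reconstruction consistent with how it is used above (constructed from two lists and a count, with fields read directly):

// Hypothetical holder for per-batch results, inferred from its usage
// in checkCoverage(); the real class in zemberek-nlp may differ.
static class Result {
  final List<String> failedWords;
  final List<String> passedWords;
  final int wordCount;

  Result(List<String> failedWords, List<String> passedWords, int wordCount) {
    this.failedWords = failedWords;
    this.passedWords = passedWords;
    this.wordCount = wordCount;
  }
}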
Use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
From class SpeedTest, method testNewsCorpusNoCache:
@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpusNoCache() throws IOException {
  Path p = Paths.get("src/main/resources/corpora/cnn-turk-10k");
  List<String> sentences = getSentences(p);
  RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
  InterpretingAnalyzer analyzer = new InterpretingAnalyzer(lexicon);
  Stopwatch sw = Stopwatch.createStarted();
  int tokenCount = 0;
  int noAnalysis = 0;
  int sentenceCount = 0;
  Histogram<String> failedWords = new Histogram<>(100000);
  for (String sentence : sentences) {
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    for (Token token : tokens) {
      // Punctuation tokens are neither counted nor analyzed.
      if (token.getType() == TurkishLexer.Punctuation) {
        continue;
      }
      tokenCount++;
      List<_SingleAnalysis> results = analyzer.analyze(token.getText());
      if (results.size() == 0) {
        noAnalysis++;
        failedWords.add(token.getText());
      }
    }
    sentenceCount++;
    if (sentenceCount % 2000 == 0) {
      Log.info("%d tokens analyzed.", tokenCount);
    }
  }
  double seconds = sw.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
  double speed = tokenCount / seconds;
  double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
  Log.info("%nElapsed = %.2f seconds", seconds);
  Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n",
      tokenCount, parseRatio, speed);
  Log.info("Saving Unknown Tokens");
  failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
  failedWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
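The two reported figures reduce to simple ratios: throughput is tokens divided by elapsed seconds, and parse ratio is the percentage of tokens that received at least one analysis. For example, 1,000,000 tokens with 40,000 analysis failures in 20 seconds gives 50,000 tokens/sec and a 96% parse ratio. A hypothetical helper isolating that arithmetic:

// Hypothetical helper (not part of SpeedTest) isolating the metric
// arithmetic used above: tokens/sec and parse ratio as a percentage.
static void logSpeedMetrics(int tokenCount, int noAnalysis, long elapsedMillis) {
  double seconds = elapsedMillis / 1000d;
  double speed = tokenCount / seconds;
  double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
  Log.info("Elapsed = %.2f s, Speed = %.2f tokens/sec, Parse Ratio = %.4f%%",
      seconds, speed, parseRatio);
}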