use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class Syllables method getTwoConsonantStartWords.
/**
 * Collects dictionary lemmas that begin with two consecutive consonants, groups them by their
 * two-letter prefix and writes three report files (relative paths):
 * <ul>
 *   <li>{@code two-consonant-words}: the prefixes, most populated group first,</li>
 *   <li>{@code two-consonant-words.all}: each prefix followed by its comma-joined lemmas,</li>
 *   <li>{@code accepted-syllable-prefixes}: the lower-cased, de-duplicated, sorted prefixes
 *       that are accepted.</li>
 * </ul>
 * Dummy items, abbreviations, lemmas shorter than four characters and lemmas with fewer than
 * two vowels are ignored. A prefix starting with an upper-case letter is accepted only when its
 * group holds more than three lemmas; every other prefix is always accepted.
 *
 * @throws IOException if the dictionaries cannot be loaded or a report file cannot be written
 */
public static void getTwoConsonantStartWords() throws IOException {
  TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
  HashMultimap<String, String> lemmasByPrefix = HashMultimap.create();
  RootLexicon dictionary = TurkishDictionaryLoader.loadDefaultDictionaries();

  for (DictionaryItem entry : dictionary) {
    String lemma = entry.lemma;
    boolean skipped = entry.attributes.contains(RootAttribute.Dummy)
        || entry.secondaryPos == SecondaryPos.Abbreviation
        || lemma.length() < 4
        || alphabet.vowelCount(lemma) < 2;
    if (skipped) {
      continue;
    }
    // The length check above guarantees that both leading characters exist.
    boolean vowelInFirstTwo = alphabet.isVowel(lemma.charAt(0)) || alphabet.isVowel(lemma.charAt(1));
    if (!vowelInFirstTwo) {
      lemmasByPrefix.put(lemma.substring(0, 2), lemma);
    }
  }

  // Prefixes ordered by descending group size.
  List<String> prefixes = new ArrayList<>(lemmasByPrefix.keySet());
  prefixes.sort((left, right) ->
      Integer.compare(lemmasByPrefix.get(right).size(), lemmasByPrefix.get(left).size()));

  List<String> detailLines = new ArrayList<>();
  // Insertion-ordered set removes duplicates that appear after lower-casing.
  LinkedHashSet<String> uniqueAccepted = new LinkedHashSet<>();
  for (String prefix : prefixes) {
    Set<String> group = lemmasByPrefix.get(prefix);
    detailLines.add(prefix + " " + String.join(",", group));
    boolean sparseCapitalized = Character.isUpperCase(prefix.charAt(0)) && group.size() <= 3;
    if (!sparseCapitalized) {
      // Keys are already exactly two characters long, so no further trimming is needed.
      uniqueAccepted.add(prefix.toLowerCase(Turkish.LOCALE));
    }
  }

  List<String> acceptedPrefixes = new ArrayList<>(uniqueAccepted);
  acceptedPrefixes.sort(Turkish.STRING_COMPARATOR_ASC);

  Files.write(Paths.get("two-consonant-words"), prefixes);
  Files.write(Paths.get("two-consonant-words.all"), detailLines);
  Files.write(Paths.get("accepted-syllable-prefixes"), acceptedPrefixes);
}
use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class SpeedTest method testNewsCorpusNoCache.
/**
 * Speed test (ignored by default): tokenizes every sentence of the {@code cnn-turk-10k} corpus,
 * analyzes each non-punctuation token with an {@code InterpretingAnalyzer} built on the default
 * dictionaries, then logs the elapsed time, token count, parse ratio and throughput.
 * Tokens that produce no analysis are counted and saved to the {@code unk.freq} (sorted by
 * count) and {@code unk} (sorted by key) files.
 *
 * @throws IOException if the corpus or dictionaries cannot be read, or the output cannot be saved
 */
@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpusNoCache() throws IOException {
  List<String> sentences = getSentences(Paths.get("src/main/resources/corpora/cnn-turk-10k"));
  InterpretingAnalyzer analyzer =
      new InterpretingAnalyzer(TurkishDictionaryLoader.loadDefaultDictionaries());

  // Timing starts only after the corpus and the analyzer are ready.
  Stopwatch timer = Stopwatch.createStarted();
  int analyzedTokens = 0;
  int unparsedTokens = 0;
  int processedSentences = 0;
  Histogram<String> unparsedWords = new Histogram<>(100000);

  for (String sentence : sentences) {
    for (Token token : TurkishTokenizer.DEFAULT.tokenize(sentence)) {
      if (token.getType() != TurkishLexer.Punctuation) {
        analyzedTokens++;
        String word = token.getText();
        if (analyzer.analyze(word).isEmpty()) {
          unparsedTokens++;
          unparsedWords.add(word);
        }
      }
    }
    // Progress report every 2000 sentences.
    if (++processedSentences % 2000 == 0) {
      Log.info("%d tokens analyzed.", analyzedTokens);
    }
  }

  long elapsedMillis = timer.stop().elapsed(TimeUnit.MILLISECONDS);
  double seconds = elapsedMillis / 1000d;
  double speed = analyzedTokens / seconds;
  double parseRatio = 100 - (unparsedTokens * 100d / analyzedTokens);

  Log.info("%nElapsed = %.2f seconds", seconds);
  Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n", analyzedTokens, parseRatio, speed);
  Log.info("Saving Unknown Tokens");
  unparsedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
  unparsedWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class ParseConsole method addTextDictionaryResources.
/**
 * Builds a lexicon from textual dictionary resources.
 * All lines of the given resources are read as UTF-8, concatenated in argument order and
 * loaded through a {@code TurkishDictionaryLoader} into a fresh {@code RootLexicon}.
 *
 * @param resources names of the dictionary resources to read
 * @return the lexicon populated from all listed resources
 * @throws IOException if a resource cannot be read
 */
public static RootLexicon addTextDictionaryResources(String... resources) throws IOException {
  RootLexicon result = new RootLexicon();
  Log.info("Dictionaries :%s", String.join(", ", resources));

  List<String> allLines = new ArrayList<>();
  for (int i = 0; i < resources.length; i++) {
    allLines.addAll(Resources.readLines(Resources.getResource(resources[i]), Charsets.UTF_8));
  }

  TurkishDictionaryLoader loader = new TurkishDictionaryLoader();
  result.addAll(loader.load(allLines));
  Log.info("Lexicon Generated.");
  return result;
}
use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class WordAnalyzerFunctionalTest method getLexiconGraph.
/**
 * Loads the given dictionary files into a single lexicon and builds a lexicon graph from it,
 * using the enclosing class's {@code suffixes} as the suffix provider.
 *
 * @param dictionaries dictionary files to load, in order
 * @return a graph containing the items of all given dictionaries
 * @throws IOException if a dictionary file cannot be loaded
 */
private DynamicLexiconGraph getLexiconGraph(File... dictionaries) throws IOException {
  RootLexicon combined = new RootLexicon();
  for (int i = 0; i < dictionaries.length; i++) {
    // A fresh loader is used for every file, as before.
    TurkishDictionaryLoader loader = new TurkishDictionaryLoader();
    loader.loadInto(combined, dictionaries[i]);
  }
  DynamicLexiconGraph lexiconGraph = new DynamicLexiconGraph(suffixes);
  lexiconGraph.addDictionaryItems(combined);
  return lexiconGraph;
}
use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.
the class SimpleGeneratorTest method getLexicon.
/**
 * Builds a lexicon graph from the {@code dev-lexicon.txt} classpath resource, using the
 * enclosing class's {@code suffixProvider}.
 *
 * @return a graph containing the dictionary items loaded from {@code dev-lexicon.txt}
 * @throws IOException if the resource cannot be resolved to a local file or cannot be loaded
 */
private DynamicLexiconGraph getLexicon() throws IOException {
  java.net.URL location = Resources.getResource("dev-lexicon.txt");
  File lexiconFile;
  try {
    // URL.getFile() returns the raw, still percent-encoded path (e.g. "%20" for a space), so a
    // checkout directory containing spaces or non-ASCII characters would yield a non-existent
    // file. Converting through a URI decodes the path correctly.
    lexiconFile = new File(location.toURI());
  } catch (java.net.URISyntaxException | IllegalArgumentException e) {
    // Keep IOException as the failure type for resources that are not plain local files.
    throw new IOException("Cannot resolve resource to a local file: " + location, e);
  }
  RootLexicon items = new TurkishDictionaryLoader().load(lexiconFile);
  DynamicLexiconGraph graph = new DynamicLexiconGraph(suffixProvider);
  graph.addDictionaryItems(items);
  return graph;
}
Aggregations