Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class MorphologyConsole, method run.
/**
 * Runs an interactive console that reads lines from standard input, analyzes and
 * disambiguates each line with {@link TurkishMorphology}, and prints every analysis
 * of every word to standard output.
 *
 * <p>The morphology instance is built from the default root lexicon; the
 * {@code disableUnknownAnalysis} and {@code enableInformalWordAnalysis} flags toggle
 * the corresponding builder options. When a word has more than one analysis, the one
 * chosen by disambiguation is marked with a trailing {@code *}.
 *
 * <p>The loop ends when the line {@code quit} is entered or when the input stream
 * reaches its end.
 */
@Override
public void run() {
  Builder b = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault());
  if (disableUnknownAnalysis) {
    b.disableUnidentifiedTokenAnalyzer();
  }
  if (enableInformalWordAnalysis) {
    b.useInformalAnalysis();
  }
  TurkishMorphology morphology = b.build();

  System.out.println("Enter word or sentence. Type `quit` or `Ctrl+C` to exit.:");
  // Deliberately not closed: closing the Scanner would also close System.in.
  Scanner sc = new Scanner(System.in);

  // hasNextLine() terminates the loop cleanly at end of input (Ctrl+D, closed pipe,
  // redirected file) instead of letting nextLine() throw NoSuchElementException.
  while (sc.hasNextLine()) {
    String input = sc.nextLine();
    if (input.equals("quit")) {
      break;
    }
    if (input.trim().length() == 0) {
      System.out.println("Empty line cannot be processed.");
      continue;
    }

    SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(input);
    System.out.format("%nS:%s%n", input);
    for (SentenceWordAnalysis sw : analysis) {
      WordAnalysis wa = sw.getWordAnalysis();
      System.out.println(wa.getInput());
      SingleAnalysis best = sw.getBestAnalysis();
      for (SingleAnalysis singleAnalysis : wa) {
        boolean isBest = singleAnalysis.equals(best);
        if (wa.analysisCount() == 1) {
          // A single analysis is trivially the best one; no marker needed.
          System.out.println(singleAnalysis.formatLong());
        } else {
          System.out.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
        }
      }
    }
    System.out.println();
  }
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class NormalizationScripts, method splitWords.
/**
 * Finds words that look like two words written together and writes the most probable
 * split for each of them.
 *
 * <p>Word counts are loaded from the {@code incorrect} files under {@code noisyVocab}
 * and {@code cleanVocab}, merged, and pruned of entries whose count is below
 * {@code minWordCount}. For every remaining word (in the histogram's sorted order) each
 * head/tail split is looked up in the language model; splits whose bigram exists are
 * scored with the model's bigram probability and the highest-scoring one is written out
 * if its score is above {@code -7}.
 *
 * <p>Two files are written: {@code splitFile} with lines {@code word = head tail}, and a
 * file whose path is {@code splitFile}'s absolute path with {@code freq} appended, with
 * lines {@code word = head tail count}.
 *
 * @param noisyVocab directory containing an {@code incorrect} word-count file
 * @param cleanVocab directory containing an {@code incorrect} word-count file
 * @param splitFile output file for the selected splits
 * @param lmPath path of the language model to load
 * @param asciiMapPath file of {@code key=value} lines; words equal to a key are skipped
 * @param morphology analyzer used to check the head of {@code de}/{@code da} splits
 * @param minWordCount words with a smaller count are ignored
 * @throws IOException if reading an input file or opening an output file fails
 */
static void splitWords(Path noisyVocab, Path cleanVocab, Path splitFile, Path lmPath, Path asciiMapPath, TurkishMorphology morphology, int minWordCount) throws IOException {
  // The key is the text before the first '='. Lines without '=' (for example blank
  // lines) are skipped; previously they made substring(0, -1) throw
  // StringIndexOutOfBoundsException.
  Set<String> asciiMapKeys = Files.readAllLines(asciiMapPath).stream()
      .filter(s -> s.indexOf('=') >= 0)
      .map(s -> s.substring(0, s.indexOf('=')))
      .collect(Collectors.toSet());

  SmoothLm lm = SmoothLm.builder(lmPath).logBase(Math.E).build();
  Log.info("Language model = %s", lm.info());

  Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
  wordFreq.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
  Log.info("%d words loaded.", wordFreq.size());
  wordFreq.removeSmaller(minWordCount);
  if (minWordCount > 1) {
    Log.info("%d words left after removing counts less than %d.", wordFreq.size(), minWordCount);
  }

  int unkIndex = lm.getVocabulary().getUnknownWordIndex();
  try (PrintWriter pw = new PrintWriter(splitFile.toFile(), "utf-8");
      PrintWriter pwFreq = new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
    for (String word : wordFreq.getSortedList()) {
      if (asciiMapKeys.contains(word)) {
        continue;
      }
      // Short words and hyphenated words are not considered for splitting.
      if (word.length() < 5 || word.contains("-")) {
        continue;
      }

      List<ScoredItem<String>> candidates = new ArrayList<>();
      // Head has at least one character; tail has at least two.
      for (int i = 1; i < word.length() - 1; i++) {
        String head = word.substring(0, i);
        String tail = word.substring(i);
        if (noSplitTails.contains(tail)) {
          continue;
        }
        int hi = lm.getVocabulary().indexOf(head);
        int ti = lm.getVocabulary().indexOf(tail);
        // Both parts must be known to the language model.
        if (hi == unkIndex || ti == unkIndex) {
          continue;
        }
        // Do not split off "de"/"da" when the head is reported correct by the analyzer.
        // NOTE(review): presumably guards the separately written de/da clitic - confirm.
        if ((tail.equals("de") || tail.equals("da")) && morphology.analyze(head).isCorrect()) {
          continue;
        }
        if (lm.ngramExists(hi, ti)) {
          candidates.add(new ScoredItem<>(head + " " + tail, lm.getProbability(hi, ti)));
        }
      }

      if (candidates.isEmpty()) {
        continue;
      }
      // Highest score first.
      candidates.sort((a, b) -> Double.compare(b.score, a.score));
      ScoredItem<String> best = candidates.get(0);
      // NOTE(review): the model is built with logBase(Math.E), so -7 is presumably a
      // natural-log probability threshold - confirm.
      if (best.score > -7) {
        pw.println(word + " = " + best.item);
        pwFreq.println(word + " = " + best.item + " " + wordFreq.getCount(word));
      }
    }
  }
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class NormalizationScripts, method cleanTwitterData.
/**
 * Processes the text at {@code in} in blocks, submitting one {@code TwitterTask} per
 * block to a thread pool, and waits for the pool to finish.
 *
 * <p>All tasks share one {@link TurkishMorphology} instance (default lexicon, dynamic
 * analysis cache, unidentified-token analysis disabled) and one {@code TwitterSaver}
 * constructed with {@code out} and a sibling path {@code out + ".foreign"}.
 *
 * <p>NOTE(review): task results are never taken from the completion service, so an
 * exception thrown inside a task is not surfaced here - confirm this is intended.
 *
 * @param in input text file
 * @param out output path passed to the saver; {@code ".foreign"} is appended to it
 *     for the saver's second path
 * @throws Exception if loading or task submission fails, or the wait is interrupted
 */
static void cleanTwitterData(Path in, Path out) throws Exception {
  AnalysisCache cache = AnalysisCache.builder().dynamicCacheSize(300_000, 500_000).build();
  TurkishMorphology morphology = TurkishMorphology.builder()
      .setCache(cache)
      .setLexicon(RootLexicon.getDefault())
      .disableUnidentifiedTokenAnalyzer()
      .build();

  // Use half of the available processors, capped at 20. The lower bound of 1 matters
  // on single-CPU hosts, where the integer division would otherwise yield 0 threads.
  int threadCount = Math.max(1, Math.min(20, Runtime.getRuntime().availableProcessors() / 2));

  ExecutorService executorService = new BlockingExecutor(threadCount);
  CompletionService<TwitterSaver> service = new ExecutorCompletionService<>(executorService);

  int blockSize = 20_000;
  BlockTextLoader loader = BlockTextLoader.fromPath(in, blockSize);
  Path foreign = Paths.get(out.toString() + ".foreign");
  TwitterSaver saver = new TwitterSaver(out, foreign, blockSize);

  int bc = 0;
  try {
    for (TextChunk block : loader) {
      service.submit(new TwitterTask(morphology, saver, block, bc));
      bc++;
    }
  } finally {
    // Always stop accepting work, even if loading or submission failed, so the pool's
    // threads are not left running.
    executorService.shutdown();
  }
  executorService.awaitTermination(1, TimeUnit.DAYS);
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class AnalyzeAndConvertInformalWords, method main.
/**
 * Example entry point: analyzes an informal sentence with informal analysis enabled,
 * prints the best analysis of each word, then prints the result of passing each
 * analysis through {@code InformalAnalysisConverter}.
 *
 * @param args unused
 */
public static void main(String[] args) {
  TurkishMorphology informalMorphology = TurkishMorphology.builder()
      .setLexicon(RootLexicon.getDefault())
      .useInformalAnalysis()
      .build();

  List<SingleAnalysis> bestAnalyses =
      informalMorphology.analyzeAndDisambiguate("okuycam diyo").bestAnalysis();

  // Surface form followed by its selected analysis.
  bestAnalyses.forEach(
      analysis -> System.out.println(analysis.surfaceForm() + "-" + analysis));

  System.out.println("Converting formal surface form:");
  InformalAnalysisConverter informalConverter =
      new InformalAnalysisConverter(informalMorphology.getWordGenerator());
  bestAnalyses.forEach(
      analysis -> System.out.println(
          informalConverter.convert(analysis.surfaceForm(), analysis)));
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class DisambiguateSentences, method main.
/**
 * Example entry point: logs every analysis of every word in a sample sentence, then
 * disambiguates the sentence and logs the best analysis of each word.
 *
 * @param args unused
 */
public static void main(String[] args) {
  final String sentence = "Bol baharatlı bir yemek yaptıralım.";
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  Log.info("Sentence = " + sentence);
  List<WordAnalysis> wordAnalyses = morphology.analyzeSentence(sentence);

  // First pass: dump all candidate analyses, word by word.
  Log.info("Sentence word analysis result:");
  wordAnalyses.forEach(word -> {
    Log.info("Word = " + word.getInput());
    word.forEach(candidate -> Log.info(candidate.formatLong()));
  });

  // Second pass: resolve ambiguity and show only the selected analyses.
  SentenceAnalysis disambiguated = morphology.disambiguate(sentence, wordAnalyses);
  Log.info("\nAfter ambiguity resolution : ");
  disambiguated.bestAnalysis().forEach(Log::info);
}
Aggregations