Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class AutomaticLabelingExperiment, method processContent:
public String processContent(TurkishMorphology analyzer, String content, boolean useRoots) {
  List<Token> docTokens = lexer.tokenize(content);
  List<String> reduced = new ArrayList<>(docTokens.size());
  for (Token token : docTokens) {
    // Skip numbers, punctuation, time expressions and unknown tokens.
    Token.Type type = token.getType();
    if (type == Token.Type.PercentNumeral
        || type == Token.Type.Number
        || type == Token.Type.Punctuation
        || type == Token.Type.RomanNumeral
        || type == Token.Type.Time
        || type == Token.Type.UnknownWord
        || type == Token.Type.Unknown) {
      continue;
    }
    reduced.add(token.getText());
  }
  String joined = String.join(" ", reduced);
  if (useRoots) {
    // Replace each word with the last lemma of its best (disambiguated) analysis.
    SentenceAnalysis analysis = analyzer.analyzeAndDisambiguate(joined);
    List<String> res = new ArrayList<>();
    for (SentenceWordAnalysis e : analysis) {
      SingleAnalysis best = e.getBestAnalysis();
      if (best.isUnknown()) {
        // Keep the surface form when the word cannot be analyzed.
        res.add(e.getWordAnalysis().getInput());
        continue;
      }
      List<String> lemmas = best.getLemmas();
      if (lemmas.isEmpty()) {
        continue;
      }
      res.add(lemmas.get(lemmas.size() - 1));
    }
    joined = String.join(" ", res);
  }
  // Remove apostrophes and lower-case using Turkish locale rules.
  return joined.replaceAll("[']", "").toLowerCase(Turkish.LOCALE);
}
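For reference, the lemma-reduction step above can be exercised on its own. The following is a minimal sketch using only API calls that appear in these snippets; the wrapper class, sample sentence, and import paths are illustrative assumptions, not part of the project:

import java.util.ArrayList;
import java.util.List;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.SentenceAnalysis;
import zemberek.morphology.analysis.SentenceWordAnalysis;
import zemberek.morphology.analysis.SingleAnalysis;

public class LemmaReductionDemo {
  public static void main(String[] args) throws Exception {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    // Disambiguate the sentence and keep the last lemma of each word, as processContent does.
    SentenceAnalysis analysis = morphology.analyzeAndDisambiguate("kitapları masaya bıraktı");
    List<String> lemmas = new ArrayList<>();
    for (SentenceWordAnalysis e : analysis) {
      SingleAnalysis best = e.getBestAnalysis();
      List<String> l = best.getLemmas();
      // Fall back to the surface form if no lemma is available.
      lemmas.add(l.isEmpty() ? e.getWordAnalysis().getInput() : l.get(l.size() - 1));
    }
    System.out.println(String.join(" ", lemmas));
  }
}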
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class _MorphologicalAmbiguityResolverExperiment, method extractData:
public void extractData(Path p, Path outRoot, int maxAnalysisCount, int resultLimit) throws IOException {
  List<Path> files = Files.walk(p, 1)
      .filter(s -> s.toFile().isFile() && s.toFile().getName().endsWith(".corpus"))
      .collect(Collectors.toList());
  LinkedHashSet<SingleAnalysisSentence> result = new LinkedHashSet<>();
  int i = 0;
  for (Path file : files) {
    List<SingleAnalysisSentence> collect = collect(file, maxAnalysisCount);
    result.addAll(collect);
    i++;
    Log.info("%d of %d", i, files.size());
    // Stop early once the requested number of sentences has been collected.
    if (resultLimit > 0 && result.size() > resultLimit) {
      break;
    }
  }
  String s = p.toFile().getName();
  Path out = outRoot.resolve(s + "-ambigious.txt");
  try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
    for (SingleAnalysisSentence sentence : result) {
      pw.println(sentence.sentence);
      for (Single single : sentence.tokens) {
        for (SingleAnalysis r : single.res) {
          pw.println(single.input);
          pw.println(r.formatLong());
        }
      }
      pw.println();
    }
  }
  // Save failed words, sorted alphabetically.
  failedWords.saveSortedByKeys(outRoot.resolve(s + "-failed.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  // Save failed words, sorted by frequency.
  failedWords.saveSortedByCounts(outRoot.resolve(s + "-failed.freq.txt"), " ");
}
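A hypothetical invocation (the paths are placeholders, and the no-argument constructor is an assumption; the excerpt does not show how the experiment class is instantiated):

// Collect sentences whose tokens each have at most 3 analyses, stopping after ~5000 sentences.
_MorphologicalAmbiguityResolverExperiment experiment = new _MorphologicalAmbiguityResolverExperiment();
experiment.extractData(
    Paths.get("corpora/news"),  // directory scanned (depth 1) for *.corpus files
    Paths.get("out"),           // output root for the -ambigious.txt and -failed files
    3,                          // maxAnalysisCount
    5000);                      // resultLimit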
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class _MorphologicalAmbiguityResolverExperiment, method collect:
private List<SingleAnalysisSentence> collect(Path p, int maxAnalysisCount) throws IOException {
  List<String> sentences = getSentences(p);
  TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
  int tokenCount = 0;
  int sentenceCount = 0;
  List<SingleAnalysisSentence> result = new ArrayList<>();
  for (String sentence : sentences) {
    // Normalize whitespace, remove soft hyphens, expand ellipsis characters.
    sentence = sentence.replaceAll("\\s+|\\u00a0", " ");
    sentence = sentence.replaceAll("[\\u00ad]", "");
    sentence = sentence.replaceAll("[…]", "...");
    List<Single> singleAnalysisWords = new ArrayList<>();
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
    boolean failed = false;
    int i = 0;
    for (Token token : tokens) {
      tokenCount++;
      String rawWord = token.getText();
      String word = Character.isUpperCase(rawWord.charAt(0))
          ? Turkish.capitalize(rawWord)
          : rawWord.toLowerCase(Turkish.LOCALE);
      WordAnalysis results;
      if (cache.containsKey(word)) {
        results = cache.get(word);
      } else {
        results = analyzer.analyze(word);
        cache.put(word, results);
      }
      if (results.analysisCount() == 0) {
        // Record unanalyzable words, unless they contain digits or punctuation.
        if (Strings.containsNone(word, "0123456789-.")) {
          failedWords.add(word);
        }
      }
      // Reject the sentence if any token has no analysis or too many analyses.
      if (results.analysisCount() < 1 || results.analysisCount() > maxAnalysisCount) {
        failed = true;
        break;
      } else {
        // Drop ProperNoun analyses for words that start with a lowercase letter.
        List<SingleAnalysis> filtered = results.stream()
            .filter(s -> !(s.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun
                && Character.isLowerCase(rawWord.charAt(0))))
            .collect(Collectors.toList());
        if (filtered.isEmpty()) {
          failed = true;
          break;
        }
        singleAnalysisWords.add(new Single(word, i, results.copyFor(filtered)));
        i++;
      }
    }
    if (!failed) {
      result.add(new SingleAnalysisSentence(sentence, singleAnalysisWords));
    }
    sentenceCount++;
    if (sentenceCount % 2000 == 0) {
      Log.info("%d sentences %d tokens analyzed. %d found", sentenceCount, tokenCount, result.size());
    }
  }
  return result;
}
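The word-level cache above avoids re-running morphological analysis for repeated tokens. A standalone sketch of the same pattern (the CachedAnalyzer wrapper is hypothetical; only TurkishMorphology.analyze and WordAnalysis come from the project):

import java.util.HashMap;
import java.util.Map;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.WordAnalysis;

class CachedAnalyzer {
  private final TurkishMorphology morphology;
  private final Map<String, WordAnalysis> cache = new HashMap<>();

  CachedAnalyzer(TurkishMorphology morphology) {
    this.morphology = morphology;
  }

  WordAnalysis analyze(String word) {
    // Runs the morphological analyzer only on a cache miss.
    return cache.computeIfAbsent(word, morphology::analyze);
  }
}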
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class CategoryPredictionExperiment, method generateSets:
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useLemmas) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
  // Count category labels and drop the rare ones.
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  categoryCounts.removeSmaller(20);
  for (String c : categoryCounts.getSortedList()) {
    System.out.println(c + " " + categoryCounts.getCount(c));
  }
  Log.info("Reduced label count = %d", categoryCounts.size());
  Log.info("Extracting data from %d documents", corpus.documentCount());
  int c = 0;
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (useOnlyTitle && document.getTitle().length() == 0) {
      continue;
    }
    String content = document.getContentAsString();
    String title = document.getTitle();
    List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
    List<String> reduced = new ArrayList<>(docTokens.size());
    String category = document.getCategory();
    if (categoryCounts.contains(category)) {
      // fastText-style label prefix.
      category = "__label__" + document.getCategory().replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }
    for (Token token : docTokens) {
      // Skip numbers, punctuation, time expressions and unknown tokens.
      Token.Type type = token.getType();
      if (type == Token.Type.PercentNumeral
          || type == Token.Type.Number
          || type == Token.Type.Punctuation
          || type == Token.Type.RomanNumeral
          || type == Token.Type.Time
          || type == Token.Type.UnknownWord
          || type == Token.Type.Unknown) {
        continue;
      }
      reduced.add(token.getText());
    }
    String join = String.join(" ", reduced);
    if (join.trim().isEmpty()) {
      continue;
    }
    if (useLemmas) {
      // Replace each word with the last lemma of its best analysis.
      SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(join);
      List<String> res = new ArrayList<>();
      for (SentenceWordAnalysis e : analysis) {
        SingleAnalysis best = e.getBestAnalysis();
        if (best.isUnknown()) {
          res.add(e.getWordAnalysis().getInput());
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.isEmpty()) {
          continue;
        }
        res.add(lemmas.get(lemmas.size() - 1));
      }
      join = String.join(" ", res);
    }
    set.add("#" + document.getId() + " " + category + " " + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generate train and test set.");
  saveSets(train, test, new LinkedHashSet<>(set));
}
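Each generated line carries a document id, a fastText-style __label__ token, and the reduced (optionally lemmatized) document text. An illustrative line (all values invented):

#10234 __label__ekonomi dolar kur rekor seviye görmek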
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class NormalizationScripts, method generateNormalizationVocabularies:
static void generateNormalizationVocabularies(TurkishMorphology morphology, Path cleanRoot, Path noisyRoot, Path outRoot) throws IOException {
  Files.createDirectories(outRoot);
  Histogram<String> correctFromNoisy = Histogram.loadFromUtf8File(noisyRoot.resolve("correct"), ' ');
  Log.info("Correct from noisy loaded.");
  Histogram<String> correctFromClean = Histogram.loadFromUtf8File(cleanRoot.resolve("correct"), ' ');
  Log.info("Correct from clean loaded.");
  correctFromClean.removeSmaller(2);
  correctFromNoisy.removeSmaller(2);
  // Buckets for suspicious words.
  Histogram<String> zero = new Histogram<>();
  Histogram<String> zeroWordZeroLemma = new Histogram<>();
  Histogram<String> zeroWordLowLemma = new Histogram<>();
  Histogram<String> lowFreq = new Histogram<>();
  Histogram<String> lowFreqLowLemmaFreq = new Histogram<>();
  Histogram<String> unusualProper = new Histogram<>();
  Histogram<String> unusualRoots = new Histogram<>();
  Histogram<String> ignore = new Histogram<>();
  double nTotal = correctFromNoisy.totalCount();
  double cTotal = correctFromClean.totalCount();
  for (String s : correctFromNoisy) {
    if (s.contains(".")) {
      ignore.add(s);
      continue;
    }
    int nCount = correctFromNoisy.getCount(s);
    double nFreq = nCount / nTotal;
    WordAnalysis an = morphology.analyze(s);
    if (unusualProper(an)) {
      unusualProper.add(s, correctFromNoisy.getCount(s));
      continue;
    }
    if (unusualRoot(an)) {
      unusualRoots.add(s, correctFromNoisy.getCount(s));
      continue;
    }
    if (!correctFromClean.contains(s)) {
      // Word never appears in the clean corpora.
      zero.add(s, nCount);
      if (an.analysisCount() > 0) {
        Set<String> allLemmas = new HashSet<>();
        for (SingleAnalysis analysis : an) {
          allLemmas.addAll(analysis.getLemmas());
        }
        boolean none = true;
        boolean lowLemmaRatio = true;
        // TODO: this is not the best way. Try extracting lemma frequencies from correct-from-clean.
        for (String l : allLemmas) {
          if (correctFromClean.contains(l)) {
            none = false;
            double lnf = correctFromNoisy.getCount(l) / nTotal;
            double lcf = correctFromClean.getCount(l) / nTotal;
            if (lnf / lcf > 10) {
              lowLemmaRatio = false;
              break;
            }
          }
        }
        if (none) {
          zeroWordZeroLemma.add(s, nCount);
        }
        if (lowLemmaRatio) {
          zeroWordLowLemma.add(s, nCount);
        }
      }
      continue;
    }
    // Word exists in both sets; flag it if it is far more frequent in noisy text.
    double cFreq = correctFromClean.getCount(s) / cTotal;
    if (nFreq / cFreq > 30) {
      lowFreq.add(s, nCount);
    }
  }
  Log.info("Saving possibly incorrect words.");
  zero.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero"), " ");
  zeroWordZeroLemma.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero-no-lemma"), " ");
  zeroWordLowLemma.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero-low-lemma"), " ");
  lowFreq.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-lowfreq"), " ");
  Log.info("Creating vocabularies.");
  // ----------- noisy ------------
  Histogram<String> noisy = new Histogram<>(1_000_000);
  Histogram<String> noisyFromCleanCorpora = Histogram.loadFromUtf8File(cleanRoot.resolve("incorrect"), ' ');
  Histogram<String> noisyFromNoisyCorpora = Histogram.loadFromUtf8File(noisyRoot.resolve("incorrect"), ' ');
  Log.info("Incorrect words loaded.");
  noisyFromCleanCorpora.removeSmaller(2);
  noisyFromNoisyCorpora.removeSmaller(2);
  noisy.add(noisyFromCleanCorpora);
  noisy.add(noisyFromNoisyCorpora);
  // ----------- possibly incorrect ------------
  Histogram<String> possiblyIncorrect = new Histogram<>(1_000_000);
  possiblyIncorrect.add(zeroWordZeroLemma);
  for (String lf : lowFreq) {
    if (!possiblyIncorrect.contains(lf)) {
      possiblyIncorrect.add(lf, zeroWordZeroLemma.getCount(lf));
    }
  }
  int threshold = 2;
  for (String z : zero) {
    int c = zero.getCount(z);
    if (!possiblyIncorrect.contains(z) && c > threshold) {
      possiblyIncorrect.add(z, c);
    }
  }
  // ----------- clean ------------
  Histogram<String> clean = new Histogram<>(1_000_000);
  clean.add(correctFromClean);
  clean.add(correctFromNoisy);
  for (String s : clean) {
    if (s.contains(".")) {
      ignore.add(s);
    }
  }
  clean.removeAll(ignore);
  // Words whose ASCII-folded form duplicates another word are suspicious.
  Histogram<String> asciiDuplicates = getAsciiDuplicates(clean);
  asciiDuplicates.saveSortedByCounts(outRoot.resolve("ascii-dups"), " ");
  possiblyIncorrect.add(asciiDuplicates);
  unusualProper.saveSortedByCounts(outRoot.resolve("unusual-proper"), " ");
  for (String s : unusualProper) {
    if (!possiblyIncorrect.contains(s)) {
      possiblyIncorrect.add(s, unusualProper.getCount(s));
    }
  }
  unusualRoots.saveSortedByCounts(outRoot.resolve("unusual-root"), " ");
  for (String s : unusualRoots) {
    if (!possiblyIncorrect.contains(s)) {
      possiblyIncorrect.add(s, unusualRoots.getCount(s));
    }
  }
  possiblyIncorrect.removeAll(ignore);
  clean.removeAll(asciiDuplicates);
  clean.removeAll(unusualProper);
  clean.removeAll(unusualRoots);
  clean.removeAll(possiblyIncorrect);
  // The three sets should be disjoint; warn if they overlap.
  Set<String> intersectionOfKeys = noisy.getIntersectionOfKeys(clean);
  int sharedKeyCount = intersectionOfKeys.size();
  if (sharedKeyCount > 0) {
    Log.warn("Incorrect and correct sets share %d keys", sharedKeyCount);
  }
  sharedKeyCount = noisy.getIntersectionOfKeys(possiblyIncorrect).size();
  if (sharedKeyCount > 0) {
    Log.warn("Incorrect and possibly incorrect sets share %d keys", sharedKeyCount);
  }
  sharedKeyCount = clean.getIntersectionOfKeys(possiblyIncorrect).size();
  if (sharedKeyCount > 0) {
    Log.warn("Correct and possibly incorrect sets share %d keys", sharedKeyCount);
  }
  Log.info("Saving sets.");
  clean.saveSortedByCounts(outRoot.resolve("correct"), " ");
  Log.info("Correct words saved.");
  noisy.saveSortedByCounts(outRoot.resolve("incorrect"), " ");
  Log.info("Incorrect words saved.");
  possiblyIncorrect.saveSortedByCounts(outRoot.resolve("possibly-incorrect"), " ");
  Log.info("Possibly incorrect words saved.");
}
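A hypothetical driver (the wrapper class and directory names are placeholders; each input root is expected to contain "correct" and "incorrect" histogram files in the word-space-count format read by Histogram.loadFromUtf8File):

import java.nio.file.Paths;
import zemberek.morphology.TurkishMorphology;

public class BuildNormalizationVocabularies {
  public static void main(String[] args) throws Exception {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    NormalizationScripts.generateNormalizationVocabularies(
        morphology,
        Paths.get("vocab/clean"),  // counts gathered from clean corpora
        Paths.get("vocab/noisy"),  // counts gathered from noisy corpora
        Paths.get("vocab/out"));   // output: correct, incorrect, possibly-incorrect
  }
}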