Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class CategoryPredictionExperiment, method generateSets.
private void generateSets(
    Path input, Path train, Path test,
    boolean useOnlyTitle, boolean useLemmas) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;

  // Count category labels, then drop labels that occur fewer than 20 times.
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  categoryCounts.removeSmaller(20);
  for (String c : categoryCounts.getSortedList()) {
    System.out.println(c + " " + categoryCounts.getCount(c));
  }
  Log.info("Reduced label count = %d", categoryCounts.size());

  Log.info("Extracting data from %d documents.", corpus.documentCount());
  int c = 0;
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (useOnlyTitle && document.getTitle().length() == 0) {
      continue;
    }
    String content = document.getContentAsString();
    String title = document.getTitle();
    List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
    List<String> reduced = new ArrayList<>(docTokens.size());

    // Skip documents whose category did not survive the frequency cutoff;
    // convert surviving categories to fastText-style __label__ markers.
    String category = document.getCategory();
    if (categoryCounts.contains(category)) {
      category = "__label__" + document.getCategory()
          .replaceAll("[ ]+", "_")
          .toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }

    // Remove numbers, punctuation, time expressions and unknown tokens.
    for (Token token : docTokens) {
      Token.Type type = token.getType();
      if (type == Token.Type.PercentNumeral
          || type == Token.Type.Number
          || type == Token.Type.Punctuation
          || type == Token.Type.RomanNumeral
          || type == Token.Type.Time
          || type == Token.Type.UnknownWord
          || type == Token.Type.Unknown) {
        continue;
      }
      reduced.add(token.getText());
    }
    String join = String.join(" ", reduced);
    if (join.trim().isEmpty()) {
      continue;
    }

    // Optionally replace each word with the last lemma of its best analysis.
    if (useLemmas) {
      SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(join);
      List<String> res = new ArrayList<>();
      for (SentenceWordAnalysis e : analysis) {
        SingleAnalysis best = e.getBestAnalysis();
        if (best.isUnknown()) {
          res.add(e.getWordAnalysis().getInput());
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.isEmpty()) {
          continue;
        }
        res.add(lemmas.get(lemmas.size() - 1));
      }
      join = String.join(" ", res);
    }

    set.add("#" + document.getId() + " " + category + " "
        + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generating train and test sets.");
  saveSets(train, test, new LinkedHashSet<>(set));
}
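The Histogram here implements a count-then-prune pattern: every category label is added, labels seen fewer than 20 times are dropped with removeSmaller, and the survivors are listed by frequency via getSortedList. A minimal sketch of just that pattern, with made-up labels standing in for the WebDocument.getCategory() values used above:

import java.util.Arrays;
import java.util.List;
import zemberek.core.collections.Histogram;

public class LabelPruningSketch {

  public static void main(String[] args) {
    // Hypothetical labels; the experiment reads these from a WebCorpus.
    List<String> labels = Arrays.asList("spor", "spor", "ekonomi", "spor", "magazin", "ekonomi");
    Histogram<String> categoryCounts = new Histogram<>();
    for (String label : labels) {
      if (!label.isEmpty()) {
        categoryCounts.add(label);
      }
    }
    categoryCounts.removeSmaller(2); // generateSets uses a cutoff of 20
    // getSortedList returns keys ordered by count, most frequent first.
    for (String label : categoryCounts.getSortedList()) {
      System.out.println(label + " " + categoryCounts.getCount(label));
    }
  }
}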
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class AmbiguityStats, method ambiguousWordStats.
public void ambiguousWordStats(String filename) throws IOException {
  List<String> lines = readAll(filename);
  Histogram<String> uniques = new Histogram<>(1_000_000);
  int total = 0;
  Splitter splitter = Splitter.on(" ").omitEmptyStrings().trimResults();
  for (String line : lines) {
    for (String s : splitter.split(line)) {
      WordAnalysis results = parser.analyze(s);
      total++;
      if (total % 50_000 == 0) {
        System.out.println("Processed: " + total);
      }
      if (results.analysisCount() > 1) {
        uniques.add(s);
      }
    }
  }
  System.out.println("Total: " + total);
  Stats st = new Stats(0.002);
  st.allCounts = (int) uniques.totalCount();
  st.allUniques = uniques.size();
  for (String s : uniques.getSortedList()) {
    int count = uniques.getCount(s);
    if (st.overCutoff(count)) {
      String p1 = percentStr3(count, st.allCounts);
      st.significantCounts += count;
      st.significantUniques++;
      System.out.println(s + " : " + count + " " + pp(p1));
    }
  }
  st.dump();
}
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class NormalizationScripts, method generateNormalizationVocabularies.
static void generateNormalizationVocabularies(
    TurkishMorphology morphology, Path cleanRoot, Path noisyRoot, Path outRoot) throws IOException {
  Files.createDirectories(outRoot);
  Histogram<String> correctFromNoisy = Histogram.loadFromUtf8File(noisyRoot.resolve("correct"), ' ');
  Log.info("Correct from noisy loaded.");
  Histogram<String> correctFromClean = Histogram.loadFromUtf8File(cleanRoot.resolve("correct"), ' ');
  Log.info("Correct from clean loaded.");
  correctFromClean.removeSmaller(2);
  correctFromNoisy.removeSmaller(2);

  Histogram<String> zero = new Histogram<>();
  Histogram<String> zeroWordZeroLemma = new Histogram<>();
  Histogram<String> zeroWordLowLemma = new Histogram<>();
  Histogram<String> lowFreq = new Histogram<>();
  Histogram<String> unusualProper = new Histogram<>();
  Histogram<String> unusualRoots = new Histogram<>();
  Histogram<String> ignore = new Histogram<>();
  double nTotal = correctFromNoisy.totalCount();
  double cTotal = correctFromClean.totalCount();

  for (String s : correctFromNoisy) {
    if (s.contains(".")) {
      ignore.add(s);
      continue;
    }
    int nCount = correctFromNoisy.getCount(s);
    double nFreq = nCount / nTotal;
    WordAnalysis an = morphology.analyze(s);
    if (unusualProper(an)) {
      unusualProper.add(s, nCount);
      continue;
    }
    if (unusualRoot(an)) {
      unusualRoots.add(s, nCount);
      continue;
    }
    if (!correctFromClean.contains(s)) {
      // The word never occurs in the clean corpora.
      zero.add(s, nCount);
      if (an.analysisCount() > 0) {
        Set<String> allLemmas = new HashSet<>();
        for (SingleAnalysis analysis : an) {
          allLemmas.addAll(analysis.getLemmas());
        }
        boolean none = true;
        boolean lowLemmaRatio = true;
        // TODO: this is not the best way. Try extracting lemma frequencies
        // from the correct-from-clean set.
        for (String l : allLemmas) {
          if (correctFromClean.contains(l)) {
            none = false;
            double lnf = correctFromNoisy.getCount(l) / nTotal;
            double lcf = correctFromClean.getCount(l) / cTotal;
            if (lnf / lcf > 10) {
              lowLemmaRatio = false;
              break;
            }
          }
        }
        if (none) {
          zeroWordZeroLemma.add(s, nCount);
        }
        if (lowLemmaRatio) {
          zeroWordLowLemma.add(s, nCount);
        }
      }
      continue;
    }
    // The word exists in the clean corpora but is far more frequent in noisy ones.
    double cFreq = correctFromClean.getCount(s) / cTotal;
    if (nFreq / cFreq > 30) {
      lowFreq.add(s, nCount);
    }
  }

  Log.info("Saving possibly incorrect words.");
  zero.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero"), " ");
  zeroWordZeroLemma.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero-no-lemma"), " ");
  zeroWordLowLemma.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero-low-lemma"), " ");
  lowFreq.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-lowfreq"), " ");

  Log.info("Creating vocabularies.");
  // ----------- noisy ------------
  Histogram<String> noisy = new Histogram<>(1_000_000);
  Histogram<String> noisyFromCleanCorpora = Histogram.loadFromUtf8File(cleanRoot.resolve("incorrect"), ' ');
  Histogram<String> noisyFromNoisyCorpora = Histogram.loadFromUtf8File(noisyRoot.resolve("incorrect"), ' ');
  Log.info("Incorrect words loaded.");
  noisyFromCleanCorpora.removeSmaller(2);
  noisyFromNoisyCorpora.removeSmaller(2);
  noisy.add(noisyFromCleanCorpora);
  noisy.add(noisyFromNoisyCorpora);

  Histogram<String> possiblyIncorrect = new Histogram<>(1_000_000);
  possiblyIncorrect.add(zeroWordZeroLemma);
  for (String lf : lowFreq) {
    if (!possiblyIncorrect.contains(lf)) {
      possiblyIncorrect.add(lf, lowFreq.getCount(lf));
    }
  }
  int threshold = 2;
  for (String z : zero) {
    int c = zero.getCount(z);
    if (!possiblyIncorrect.contains(z) && c > threshold) {
      possiblyIncorrect.add(z, c);
    }
  }

  Histogram<String> clean = new Histogram<>(1_000_000);
  clean.add(correctFromClean);
  clean.add(correctFromNoisy);
  for (String s : clean) {
    if (s.contains(".")) {
      ignore.add(s);
    }
  }
  clean.removeAll(ignore);

  Histogram<String> asciiDuplicates = getAsciiDuplicates(clean);
  asciiDuplicates.saveSortedByCounts(outRoot.resolve("ascii-dups"), " ");
  possiblyIncorrect.add(asciiDuplicates);

  unusualProper.saveSortedByCounts(outRoot.resolve("unusual-proper"), " ");
  for (String s : unusualProper) {
    if (!possiblyIncorrect.contains(s)) {
      possiblyIncorrect.add(s, unusualProper.getCount(s));
    }
  }
  unusualRoots.saveSortedByCounts(outRoot.resolve("unusual-root"), " ");
  for (String s : unusualRoots) {
    if (!possiblyIncorrect.contains(s)) {
      possiblyIncorrect.add(s, unusualRoots.getCount(s));
    }
  }
  possiblyIncorrect.removeAll(ignore);

  clean.removeAll(asciiDuplicates);
  clean.removeAll(unusualProper);
  clean.removeAll(unusualRoots);
  clean.removeAll(possiblyIncorrect);

  // The three vocabularies should be disjoint; warn if they are not.
  int sharedKeyCount = noisy.getIntersectionOfKeys(clean).size();
  if (sharedKeyCount > 0) {
    Log.warn("Incorrect and correct sets share %d keys", sharedKeyCount);
  }
  sharedKeyCount = noisy.getIntersectionOfKeys(possiblyIncorrect).size();
  if (sharedKeyCount > 0) {
    Log.warn("Incorrect and possibly incorrect sets share %d keys", sharedKeyCount);
  }
  sharedKeyCount = clean.getIntersectionOfKeys(possiblyIncorrect).size();
  if (sharedKeyCount > 0) {
    Log.warn("Correct and possibly incorrect sets share %d keys", sharedKeyCount);
  }

  Log.info("Saving sets.");
  clean.saveSortedByCounts(outRoot.resolve("correct"), " ");
  Log.info("Correct words saved.");
  noisy.saveSortedByCounts(outRoot.resolve("incorrect"), " ");
  Log.info("Incorrect words saved.");
  possiblyIncorrect.saveSortedByCounts(outRoot.resolve("possibly-incorrect"), " ");
  Log.info("Possibly incorrect words saved.");
}
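Beyond per-key counting, this script leans on Histogram's set-style operations: add(Histogram) merges whole tables, add(key, count) inserts with an explicit weight, removeAll strips every key of another histogram, and getIntersectionOfKeys reports overlap. A small sketch of those operations with fabricated counts (it assumes, as the vocabulary merging above implies, that add(Histogram) accumulates counts for shared keys):

import java.util.Set;
import zemberek.core.collections.Histogram;

public class VocabularySetSketch {

  public static void main(String[] args) {
    Histogram<String> clean = new Histogram<>();
    clean.add("ev", 10); // add with an explicit count
    clean.add("araba", 5);
    clean.add("evv", 2);

    Histogram<String> noisy = new Histogram<>();
    noisy.add("evv", 7);
    noisy.add("gidiyom", 3);

    // Merge whole histograms; counts of shared keys accumulate.
    Histogram<String> merged = new Histogram<>();
    merged.add(clean);
    merged.add(noisy);
    System.out.println("evv in merged: " + merged.getCount("evv")); // 9

    // Keys present in both tables.
    Set<String> shared = clean.getIntersectionOfKeys(noisy);
    System.out.println("shared: " + shared); // [evv]

    // Drop every key of 'noisy' from 'clean', whatever its count.
    clean.removeAll(noisy);
    System.out.println("clean size: " + clean.size()); // 2
  }
}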
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class DataConverter, method extract.
private static void extract(Path dataPath, Path output) throws IOException {
  DataSet set = com.google.common.io.Files
      .asCharSource(dataPath.toFile(), Charsets.UTF_8)
      .readLines(new DataSetLoader());
  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict",
          "tr/person-names.dict").build());
  List<SentenceAnalysis> result = new ArrayList<>();
  Histogram<String> parseFails = new Histogram<>();
  for (SentenceData sentenceData : set) {
    List<String> tokens = Splitter.on(" ").splitToList(sentenceData.sentence());
    if (tokens.isEmpty() || tokens.size() != sentenceData.correctParse.size()) {
      continue;
    }
    List<SentenceWordAnalysis> correctList = new ArrayList<>();
    for (int i = 0; i < tokens.size(); i++) {
      String s = tokens.get(i);
      String p = sentenceData.correctParse.get(i);
      // Normalize the gold parse string toward zemberek's analysis format.
      // PCNom is uppercased first so the following Nom removal does not clip it.
      p = p.replaceAll("PCNom", "PCNOM");
      p = p.replaceAll("Pnon|Nom", "");
      p = p.replaceAll("\\+Pos\\+", "+");
      p = p.replaceAll("\\+Pos\\^DB", "^DB");
      p = p.replaceAll("[+]+", "+");
      p = p.replaceAll("[+]$", "");
      p = p.replaceAll("[+]\\^DB", "^DB");
      p = p.replaceAll("[.]", "");
      p = p.toLowerCase(Turkish.LOCALE);
      p = p.replaceAll("adverb", "adv");
      p = p.replaceAll("\\+cop\\+a3sg", "+a3sg+cop");
      p = p.replaceAll("\\+Unable", "^DB+Verb+Able+Neg");
      if (lookup.containsKey(p)) {
        p = lookup.get(p);
      }
      WordAnalysis a = morphology.analyze(s);
      if (!a.isCorrect()) {
        break;
      }
      // Find the analysis whose converted form matches the gold parse.
      SingleAnalysis best = null;
      for (SingleAnalysis analysis : a) {
        if (convert(analysis).equals(p)) {
          best = analysis;
          break;
        }
      }
      // Retry with a proper-noun reading for capitalized common nouns.
      if (best == null
          && Character.isUpperCase(s.charAt(0))
          && p.contains("+noun") && !p.contains("prop")) {
        String pp = p.replaceFirst("\\+noun", "\\+noun+prop");
        for (SingleAnalysis analysis : a) {
          if (convert(analysis).equals(pp)) {
            best = analysis;
            break;
          }
        }
      }
      if (best == null) {
        parseFails.add(s + " " + p);
      } else {
        correctList.add(new SentenceWordAnalysis(best, a));
      }
    }
    if (correctList.size() == tokens.size()) {
      result.add(new SentenceAnalysis(sentenceData.sentence(), correctList));
    }
  }
  Scripts.saveUnambiguous(result, output);
  parseFails.removeSmaller(3);
  parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
  System.out.format("Full Sentence Match = %d in %d%n", result.size(), set.sentences.size());
}
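parseFails shows the typical end-of-run reporting idiom: collect failure records, prune the rare ones with removeSmaller, then persist the rest with saveSortedByCounts. A hedged sketch with fabricated records; the " " delimiter matches what Histogram.loadFromUtf8File consumes elsewhere in the project, so the output is presumably one key/count pair per line:

import java.io.IOException;
import java.nio.file.Paths;
import zemberek.core.collections.Histogram;

public class FailureReportSketch {

  public static void main(String[] args) throws IOException {
    Histogram<String> parseFails = new Histogram<>();
    // Fabricated "word expected-parse" records, as built in extract().
    parseFails.add("gidiyom gidiyom+verb+prog1+a1sg", 5);
    parseFails.add("evde ev+noun+a3sg+loc", 1);
    parseFails.removeSmaller(3); // keep failures seen at least 3 times
    parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
  }
}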
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class WordHistogram, method generateHistograms.
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Histogram<String> roots = new Histogram<>(1_000_000);
  Histogram<String> words = new Histogram<>(1_000_000);
  int paragraphCounter = 0;
  int sentenceCounter = 0;
  int tokenCounter = 0;
  for (String paragraph : paragraphs) {
    List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
    sentenceCounter += sentences.size();
    for (String sentence : sentences) {
      List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
      tokenCounter += tokens.size();
      SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
      for (SentenceWordAnalysis e : analysis) {
        SingleAnalysis best = e.getBestAnalysis();
        // Skip numerals, punctuation, unknown words and digit-bearing runtime analyses.
        if (best.getPos() == PrimaryPos.Numeral || best.getPos() == PrimaryPos.Punctuation) {
          continue;
        }
        if (best.isUnknown()) {
          continue;
        }
        if (best.isRuntime()
            && !Strings.containsNone(e.getWordAnalysis().getInput(), "0123456789")) {
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.isEmpty()) {
          continue;
        }
        roots.add(best.getDictionaryItem().lemma);
        // Normalize case: lowercase everything except proper nouns, which are capitalized.
        String w = e.getWordAnalysis().getInput();
        if (best.getDictionaryItem().secondaryPos != SecondaryPos.ProperNoun) {
          w = w.toLowerCase(Turkish.LOCALE);
        } else {
          w = Turkish.capitalize(w);
        }
        words.add(w);
      }
    }
    paragraphCounter++;
    if (paragraphCounter % 1000 == 0) {
      System.out.println(paragraphCounter + " of " + paragraphs.size());
    }
  }
  System.out.println("tokenCounter = " + tokenCounter);
  System.out.println("sentenceCounter = " + sentenceCounter);
  Files.createDirectories(outRoot);
  roots.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
  roots.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  words.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
  words.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  // Save a second word set keeping only words that occur at least 10 times.
  words.removeSmaller(10);
  words.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
  words.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
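The save calls at the end highlight the two output orders Histogram supports: saveSortedByCounts writes keys by descending frequency, while saveSortedByKeys takes an explicit comparator, here the Turkish-aware one. A minimal sketch of both, assuming the Turkish helper class used above is zemberek.core.turkish.Turkish:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import zemberek.core.collections.Histogram;
import zemberek.core.turkish.Turkish;

public class HistogramSaveSketch {

  public static void main(String[] args) throws IOException {
    Histogram<String> words = new Histogram<>();
    words.add("çilek", 12);
    words.add("elma", 4);
    words.add("armut", 7);

    Path outRoot = Paths.get("histogram-out"); // hypothetical output directory
    Files.createDirectories(outRoot);
    // Descending count order: çilek, armut, elma.
    words.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
    // Alphabetical order using the locale-aware Turkish comparator.
    words.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  }
}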