Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class ZemberekNlpScripts, method frequentUnknown:
@Test
@Ignore("Not a Test.")
public void frequentUnknown() throws IOException {
  Path wordFreqFile = DATA_PATH.resolve("vocab.all.freq");
  Log.info("Loading histogram.");
  Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
  Path dir = DATA_PATH.resolve("out");
  List<String> oflazerAll = Files.readAllLines(dir.resolve("oflazer-parsed-words.txt"));
  List<String> zemberekAll = Files.readAllLines(dir.resolve("zemberek-parsed-words.txt"));
  // Remove every word that either analyzer could parse, then drop rare leftovers.
  histogram.removeAll(oflazerAll);
  histogram.removeAll(zemberekAll);
  histogram.removeSmaller(10);
  // Save the remaining unknown words, sorted by frequency and by Turkish collation.
  Files.write(dir.resolve("no-parse-freq.txt"), histogram.getSortedList());
  Files.write(dir.resolve("no-parse-tr.txt"),
      histogram.getSortedList((a, b) -> turkishCollator.compare(a, b)));
}
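
For readers new to the class, here is a minimal sketch of the Histogram contract the snippets on this page rely on, restricted to calls that actually appear here; the behavior noted in the comments is inferred from the usage above and below, not from the library documentation.

import zemberek.core.collections.Histogram;

public class HistogramSketch {
  public static void main(String[] args) {
    Histogram<String> h = new Histogram<>();
    h.add("elma");          // count becomes 1
    h.add("elma");          // count becomes 2
    h.add("armut", 5);      // add with an explicit count, as in LoadProperNouns below
    System.out.println(h.getCount("elma")); // expected: 2
    h.removeSmaller(3);     // presumably drops keys whose count is below 3
    System.out.println(h.getSortedList()); // presumably keys by descending count
  }
}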
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class ZemberekNlpScripts, method getFrequentZemberek:
@Test
@Ignore("Not a Test.")
public void getFrequentZemberek() throws IOException {
  int min = 30;
  Path wordFreqFile = DATA_PATH.resolve("vocab.all.freq");
  Path outDir = DATA_PATH.resolve("out");
  Log.info("Loading histogram.");
  Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
  List<String> all = TextIO.loadLines(outDir.resolve("zemberek-parsed-words.txt"));
  // Keep only the parsed words that occur at least `min` times in the corpus.
  List<String> result = all.stream()
      .filter(s -> histogram.getCount(s) >= min)
      .collect(Collectors.toList());
  sortAndSave(outDir.resolve("zemberek-parsed-words-min" + min + ".txt"), result);
}
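
The ' ' delimiter passed to Histogram.loadFromUtf8File suggests the frequency file stores one key-count pair per line. A hypothetical round trip under that assumption; the exact line format is not confirmed by anything on this page.

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import zemberek.core.collections.Histogram;

public class LoadHistogramSketch {
  public static void main(String[] args) throws Exception {
    Path tmp = Files.createTempFile("vocab", ".freq");
    // Assumed line format: "<key><delimiter><count>".
    Files.write(tmp, List.of("elma 42", "armut 7"));
    Histogram<String> h = Histogram.loadFromUtf8File(tmp, ' ');
    System.out.println(h.getCount("elma")); // expected: 42
  }
}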
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class LoadProperNouns, method main:
public static void main(String[] args) throws IOException {
  TurkishMorphology parserGenerator = TurkishMorphology.createWithDefaults();
  List<String> lines = Files.readAllLines(
      Paths.get("/home/afsina/Downloads/documents-export-2016-02-17/vocabulary-proper-full.tr.txt"));
  Histogram<String> histogram = new Histogram<>();
  Set<String> ignore = new HashSet<>(
      Files.readAllLines(Paths.get("morphology/src/main/resources/tr/proper-ignore")));
  for (String line : lines) {
    if (line.startsWith("_")) {
      continue;
    }
    line = line.trim();
    if (line.length() == 0) {
      continue;
    }
    // Each line is "word count"; drop the word's first character
    // (apparently a marker) and capitalize with Turkish casing rules.
    String word = Strings.subStringUntilFirst(line, " ");
    int count = Integer.parseInt(Strings.subStringAfterFirst(line, " "));
    word = Turkish.capitalize(word.substring(1));
    if (count < 50) {
      continue;
    }
    if (ignore.contains(word)) {
      continue;
    }
    // Skip words the morphology already knows as dictionary proper nouns.
    WordAnalysis parses = parserGenerator.analyze(word);
    boolean found = false;
    for (SingleAnalysis parse : parses) {
      if (parse.getDictionaryItem().secondaryPos.equals(SecondaryPos.ProperNoun)
          && !parse.getDictionaryItem().hasAttribute(RootAttribute.Runtime)) {
        found = true;
      }
    }
    parserGenerator.invalidateCache();
    if (found) {
      continue;
    }
    if (word.length() < 4) {
      continue;
    }
    histogram.add(word, count);
  }
  histogram.removeSmaller(165);
  try (PrintWriter pw = new PrintWriter("proper")) {
    histogram.getSortedList(Turkish.STRING_COMPARATOR_ASC).forEach(pw::println);
  }
}
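
The inner loop above boils down to a single predicate: does any analysis of the word resolve to a dictionary proper noun that was not generated at runtime? A hypothetical helper distilling it; the import paths are assumptions for a recent zemberek version.

import zemberek.core.turkish.RootAttribute;
import zemberek.core.turkish.SecondaryPos;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.SingleAnalysis;
import zemberek.morphology.lexicon.DictionaryItem;

class ProperNounCheck {
  // True if any analysis of the word maps to a dictionary ProperNoun entry
  // that does not carry the Runtime attribute.
  static boolean isKnownProperNoun(TurkishMorphology morphology, String word) {
    for (SingleAnalysis analysis : morphology.analyze(word)) {
      DictionaryItem item = analysis.getDictionaryItem();
      if (item.secondaryPos == SecondaryPos.ProperNoun
          && !item.hasAttribute(RootAttribute.Runtime)) {
        return true;
      }
    }
    return false;
  }
}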
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class AutomaticLabelingExperiment, method generateSetForLabelExperiment:
Set<String> generateSetForLabelExperiment(Path input, TurkishMorphology analyzer, boolean useRoots)
    throws IOException {
  WebCorpus corpus = new WebCorpus("label", "labeled");
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  Log.info("Extracting data.");
  // Count label frequencies over the whole corpus, lowercased with Turkish rules.
  Histogram<String> labelCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    List<String> labels = document.getLabels();
    List<String> lowerCase = labels.stream()
        .filter(s -> s.length() > 1)
        .map(s -> s.toLowerCase(Turkish.LOCALE))
        .collect(Collectors.toList());
    labelCounts.add(lowerCase);
  }
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
  Log.info("All label count = %d", labelCounts.size());
  labelCounts.removeSmaller(15);
  Log.info("Reduced label count = %d", labelCounts.size());
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");
  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  // Deduplicate documents by content hash.
  Set<Long> contentHash = new HashSet<>();
  for (WebDocument document : corpus.getDocuments()) {
    Long hash = document.getHash();
    if (contentHash.contains(hash)) {
      continue;
    }
    contentHash.add(hash);
    // Collect fastText-style "__label__x" tags for labels that survived pruning.
    List<String> labelTags = new ArrayList<>();
    boolean labelFound = false;
    for (String label : document.getLabels()) {
      if (labelCounts.contains(label)) {
        labelTags.add("__label__" + label.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE));
        labelFound = true;
      }
    }
    if (!labelFound) {
      continue;
    }
    String labelStr = String.join(" ", labelTags);
    String content = document.getContentAsString();
    // Note: uses the class's morphology field; the analyzer parameter is unused here.
    String processed = processContent(morphology, content, useRoots);
    if (processed.length() < 200) {
      continue;
    }
    set.add("#" + document.getId() + " " + labelStr + " " + processed);
    if (c++ % 1000 == 0) {
      Log.info("%d processed.", c);
    }
  }
  Log.info("Generate train and test set.");
  Collections.shuffle(set, new Random(1));
  return new LinkedHashSet<>(set);
}
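
The "#id __label__x ... content" lines follow the fastText supervised-training input convention, where every token prefixed with __label__ marks a class for the document. A hypothetical helper isolating just the formatting step; the names are not from the project.

import java.util.List;
import java.util.stream.Collectors;
import zemberek.core.turkish.Turkish;

class FastTextLine {
  // Builds "#<id> __label__<l1> __label__<l2> ... <content>".
  static String format(String id, List<String> labels, String content) {
    String labelStr = labels.stream()
        .map(l -> "__label__" + l.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE))
        .collect(Collectors.joining(" "));
    return "#" + id + " " + labelStr + " " + content;
  }
}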
Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.
The class CategoryPredictionExperiment, method generateRawSet:
private void generateRawSet(Path input, Path train) throws IOException {
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  // Count category frequencies, then prune rare categories.
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  categoryCounts.removeSmaller(20);
  for (String c : categoryCounts.getSortedList()) {
    System.out.println(c + " " + categoryCounts.getCount(c));
  }
  Log.info("Reduced category count = %d", categoryCounts.size());
  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (document.getTitle().length() == 0) {
      continue;
    }
    String title = document.getTitle();
    String category = document.getCategory();
    // Skip catch-all or irrelevant categories.
    if (category.contains("CNN") || category.contains("Güncel")
        || category.contains("Euro 2016") || category.contains("Yazarlar")
        || category.contains("Ajanda")) {
      continue;
    }
    // Normalize a few category aliases.
    if (category.equals("İyilik Sağlık")) {
      category = "Sağlık";
    }
    if (category.equals("Spor Diğer")) {
      category = "Spor";
    }
    if (categoryCounts.contains(category)) {
      category = "__label__" + category.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }
    set.add(category + " " + title);
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generate raw set.");
  Files.write(train, set, StandardCharsets.UTF_8);
}
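
The category renames above are hard-coded equals chains; a hypothetical alias table (not from the project, Java 9+ Map.of assumed) expresses the same normalization more compactly and keeps future aliases in one place.

import java.util.Map;

class CategoryAliases {
  static final Map<String, String> ALIASES = Map.of(
      "İyilik Sağlık", "Sağlık",
      "Spor Diğer", "Spor");

  // Returns the canonical category name, or the input unchanged if no alias exists.
  static String normalize(String category) {
    return ALIASES.getOrDefault(category, category);
  }
}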