Use of zemberek.corpus.WebDocument in project zemberek-nlp by ahmetaa.
The class CategoryPredictionExperiment, method generateSets:
private void generateSets(
    Path input, Path train, Path test, boolean useOnlyTitle, boolean useLemmas)
    throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;

  // Count categories and keep only those occurring at least 20 times.
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  categoryCounts.removeSmaller(20);
  for (String c : categoryCounts.getSortedList()) {
    System.out.println(c + " " + categoryCounts.getCount(c));
  }
  Log.info("Reduced label count = %d", categoryCounts.size());

  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (useOnlyTitle && document.getTitle().length() == 0) {
      continue;
    }
    String content = document.getContentAsString();
    String title = document.getTitle();
    List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
    List<String> reduced = new ArrayList<>(docTokens.size());

    String category = document.getCategory();
    if (categoryCounts.contains(category)) {
      category = "__label__" + document.getCategory()
          .replaceAll("[ ]+", "_")
          .toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }

    // Drop numbers, punctuation, time expressions and unknown tokens.
    for (Token token : docTokens) {
      if (token.getType() == Token.Type.PercentNumeral ||
          token.getType() == Token.Type.Number ||
          token.getType() == Token.Type.Punctuation ||
          token.getType() == Token.Type.RomanNumeral ||
          token.getType() == Token.Type.Time ||
          token.getType() == Token.Type.UnknownWord ||
          token.getType() == Token.Type.Unknown) {
        continue;
      }
      String tokenStr = token.getText();
      reduced.add(tokenStr);
    }
    String join = String.join(" ", reduced);
    if (join.trim().isEmpty()) {
      continue;
    }

    if (useLemmas) {
      // Replace each word with the last lemma of its best morphological analysis.
      SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(join);
      List<String> res = new ArrayList<>();
      for (SentenceWordAnalysis e : analysis) {
        SingleAnalysis best = e.getBestAnalysis();
        if (best.isUnknown()) {
          res.add(e.getWordAnalysis().getInput());
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.size() == 0) {
          continue;
        }
        res.add(lemmas.get(lemmas.size() - 1));
      }
      join = String.join(" ", res);
    }

    set.add("#" + document.getId() + " " + category + " "
        + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generate train and test set.");
  saveSets(train, test, new LinkedHashSet<>(set));
}
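Each line that generateSets collects follows the fastText supervised-training convention: a "#<document id>" prefix, a single __label__ tag derived from the category, and the cleaned, lowercased (and optionally lemmatized) document text. A hypothetical output line, with id, category and text invented purely for illustration:

#doc-4211 __label__ekonomi merkez bankası faiz kararını açıkladı

saveSets then presumably splits these lines into the train and test files.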
The class CategoryPredictionExperiment, method runExperiment:
private void runExperiment() throws Exception {
  Path corpusPath = experimentRoot.resolve("category.corpus");
  Path train = experimentRoot.resolve("category.train");
  Path test = experimentRoot.resolve("category.test");
  Path titleRaw = experimentRoot.resolve("category.title");
  Path modelPath = experimentRoot.resolve("category.model");
  Path predictionPath = experimentRoot.resolve("category.predictions");

  extractCategoryDocuments(rawCorpusRoot, corpusPath);
  boolean useOnlyTitles = true;
  boolean useLemmas = true;
  generateSets(corpusPath, train, test, useOnlyTitles, useLemmas);
  generateRawSet(corpusPath, titleRaw);

  FastText fastText;
  if (modelPath.toFile().exists()) {
    Log.info("Reusing existing model %s", modelPath);
    fastText = FastText.load(modelPath);
  } else {
    // Train a supervised fastText classifier on the generated training set.
    Args argz = Args.forSupervised();
    argz.thread = 4;
    argz.model = Args.model_name.supervised;
    argz.loss = Args.loss_name.softmax;
    argz.epoch = 50;
    argz.wordNgrams = 2;
    argz.minCount = 0;
    argz.lr = 0.5;
    argz.dim = 100;
    argz.bucket = 5_000_000;
    fastText = new FastTextTrainer(argz).train(train);
    fastText.saveModel(modelPath);
  }

  EvaluationResult result = fastText.test(test, 1);
  Log.info(result.toString());

  WebCorpus corpus = new WebCorpus("corpus", "labeled");
  corpus.addDocuments(WebCorpus.loadDocuments(corpusPath));
  Log.info("Testing started.");
  List<String> testLines = Files.readAllLines(test, StandardCharsets.UTF_8);
  try (PrintWriter pw = new PrintWriter(predictionPath.toFile(), "utf-8")) {
    for (String testLine : testLines) {
      // Test lines start with "#<document id>"; use it to look up the original document.
      String id = testLine.substring(0, testLine.indexOf(' ')).substring(1);
      WebDocument doc = corpus.getDocument(id);
      List<ScoredItem<String>> res = fastText.predict(testLine, 3);
      List<String> predictedCategories = new ArrayList<>();
      for (ScoredItem<String> re : res) {
        if (re.score < -10) {
          continue;
        }
        predictedCategories.add(String.format(Locale.ENGLISH, "%s (%.2f)",
            re.item.replaceAll("__label__", "").replaceAll("_", " "), re.score));
      }
      pw.println("id = " + id);
      pw.println();
      pw.println(doc.getTitle());
      pw.println();
      pw.println("Actual Category = " + doc.getCategory());
      pw.println("Predictions = " + String.join(", ", predictedCategories));
      pw.println();
      pw.println("------------------------------------------------------");
      pw.println();
    }
  }
  Log.info("Done.");
}
The class AutomaticLabelingExperiment, method generateSetForLabelExperiment (TurkishSentenceAnalyzer variant):
Set<String> generateSetForLabelExperiment(
    Path input, TurkishSentenceAnalyzer analyzer, boolean useRoots) throws IOException {
  WebCorpus corpus = new WebCorpus("label", "labeled");
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  Log.info("Extracting data.");

  // Collect label frequencies; keep only labels seen at least 15 times.
  Histogram<String> labelCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    List<String> labels = document.getLabels();
    List<String> lowerCase = labels.stream()
        .filter(s -> s.length() > 1)
        .map(s -> s.toLowerCase(Turkish.LOCALE))
        .collect(Collectors.toList());
    labelCounts.add(lowerCase);
  }
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
  Log.info("All label count = %d", labelCounts.size());
  labelCounts.removeSmaller(15);
  Log.info("Reduced label count = %d", labelCounts.size());
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");

  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  Set<Long> contentHash = new HashSet<>();
  for (WebDocument document : corpus.getDocuments()) {
    // Skip documents with identical content.
    Long hash = document.getHash();
    if (contentHash.contains(hash)) {
      continue;
    }
    contentHash.add(hash);

    List<String> labelTags = new ArrayList<>();
    boolean labelFound = false;
    for (String label : document.getLabels()) {
      if (labelCounts.contains(label)) {
        labelTags.add("__label__" + label.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE));
        labelFound = true;
      }
    }
    if (!labelFound) {
      continue;
    }
    String labelStr = String.join(" ", labelTags);

    String content = document.getContentAsString();
    String processed = processContent(analyzer, content, useRoots);
    if (processed.length() < 200) {
      continue;
    }
    set.add("#" + document.getId() + " " + labelStr + " " + processed);
    if (c++ % 1000 == 0) {
      Log.info("%d processed.", c);
    }
  }
  Log.info("Generate train and test set.");
  Collections.shuffle(set, new Random(1));
  return new LinkedHashSet<>(set);
}
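In contrast to the single-category lines produced by generateSets above, a document here may carry several surviving labels, so a training line can contain multiple __label__ tags before the processed content. A hypothetical line, with id, labels and text invented purely for illustration:

#doc-1088 __label__sağlık __label__beslenme uzmanlar dengeli beslenmenin önemini vurguladı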
The class AutomaticLabelingExperiment, method generateSetForLabelExperiment (TurkishMorphology variant):
Set<String> generateSetForLabelExperiment(
    Path input, TurkishMorphology morphology, boolean useRoots) throws IOException {
  WebCorpus corpus = new WebCorpus("label", "labeled");
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  Log.info("Extracting data.");

  // Collect label frequencies; keep only labels seen at least 15 times.
  Histogram<String> labelCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    List<String> labels = document.getLabels();
    List<String> lowerCase = labels.stream()
        .filter(s -> s.length() > 1)
        .map(s -> s.toLowerCase(Turkish.LOCALE))
        .collect(Collectors.toList());
    labelCounts.add(lowerCase);
  }
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
  Log.info("All label count = %d", labelCounts.size());
  labelCounts.removeSmaller(15);
  Log.info("Reduced label count = %d", labelCounts.size());
  labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");

  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  Set<Long> contentHash = new HashSet<>();
  for (WebDocument document : corpus.getDocuments()) {
    // Skip documents with identical content.
    Long hash = document.getHash();
    if (contentHash.contains(hash)) {
      continue;
    }
    contentHash.add(hash);

    List<String> labelTags = new ArrayList<>();
    boolean labelFound = false;
    for (String label : document.getLabels()) {
      if (labelCounts.contains(label)) {
        labelTags.add("__label__" + label.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE));
        labelFound = true;
      }
    }
    if (!labelFound) {
      continue;
    }
    String labelStr = String.join(" ", labelTags);

    String content = document.getContentAsString();
    String processed = processContent(morphology, content, useRoots);
    if (processed.length() < 200) {
      continue;
    }
    set.add("#" + document.getId() + " " + labelStr + " " + processed);
    if (c++ % 1000 == 0) {
      Log.info("%d processed.", c);
    }
  }
  Log.info("Generate train and test set.");
  Collections.shuffle(set, new Random(1));
  return new LinkedHashSet<>(set);
}
The class AutomaticLabelingExperiment, method test:
private void test(Path corpusPath, Path testData, Path predictionPath, FastText fastText)
    throws IOException {
  WebCorpus corpus = new WebCorpus("label", "label");
  corpus.addDocuments(WebCorpus.loadDocuments(corpusPath));
  Log.info("Testing started.");
  List<String> testLines = Files.readAllLines(testData, StandardCharsets.UTF_8);
  Stopwatch sw = Stopwatch.createStarted();
  try (PrintWriter pw = new PrintWriter(predictionPath.toFile(), "utf-8")) {
    for (String testLine : testLines) {
      String id = testLine.substring(0, testLine.indexOf(' ')).substring(1);
      WebDocument doc = corpus.getDocument(id);
      List<ScoredItem<String>> res = fastText.predict(testLine, 7);
      List<String> predictedLabels = new ArrayList<>();
      for (ScoredItem<String> re : res) {
        predictedLabels.add(String.format(Locale.ENGLISH, "%s (%.2f)",
            re.item.replaceAll("__label__", "").replaceAll("_", " "), re.score));
      }
      pw.println("id = " + id);
      pw.println();
      pw.println(doc.getContentAsString().replaceAll("[\n\r]+", "\n"));
      pw.println();
      pw.println("Actual Labels = " + String.join(", ", doc.getLabels()));
      pw.println("Predictions = " + String.join(", ", predictedLabels));
      pw.println();
      pw.println("------------------------------------------------------");
      pw.println();
    }
  }
  Log.info("Done. in %d ms.", sw.elapsed(TimeUnit.MILLISECONDS));
}