Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
The class CategoryPredictionExperiment, method generateSets:
private void generateSets(
    Path input, Path train, Path test,
    boolean useOnlyTitle, boolean useLemmas) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
  // Count category frequencies over the whole corpus.
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  // Drop categories that occur fewer than 20 times.
  categoryCounts.removeSmaller(20);
  for (String c : categoryCounts.getSortedList()) {
    System.out.println(c + " " + categoryCounts.getCount(c));
  }
  Log.info("Reduced label count = %d", categoryCounts.size());
  Log.info("Extracting data from %d documents.", corpus.documentCount());
  int c = 0;
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (useOnlyTitle && document.getTitle().length() == 0) {
      continue;
    }
    String content = document.getContentAsString();
    String title = document.getTitle();
    List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
    List<String> reduced = new ArrayList<>(docTokens.size());
    String category = document.getCategory();
    if (categoryCounts.contains(category)) {
      // Build a fastText-style label: "__label__" + lowercased category,
      // with runs of spaces replaced by a single underscore.
      category = "__label__" + document.getCategory()
          .replaceAll("[ ]+", "_")
          .toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }
    for (Token token : docTokens) {
      Token.Type type = token.getType();
      // Skip numbers, punctuation, time expressions and unknown tokens.
      if (type == Token.Type.PercentNumeral
          || type == Token.Type.Number
          || type == Token.Type.Punctuation
          || type == Token.Type.RomanNumeral
          || type == Token.Type.Time
          || type == Token.Type.UnknownWord
          || type == Token.Type.Unknown) {
        continue;
      }
      reduced.add(token.getText());
    }
    String join = String.join(" ", reduced);
    if (join.trim().isEmpty()) {
      continue;
    }
    if (useLemmas) {
      // Replace each word with the last lemma of its best disambiguated analysis.
      SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(join);
      List<String> res = new ArrayList<>();
      for (SentenceWordAnalysis e : analysis) {
        SingleAnalysis best = e.getBestAnalysis();
        if (best.isUnknown()) {
          res.add(e.getWordAnalysis().getInput());
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.size() == 0) {
          continue;
        }
        res.add(lemmas.get(lemmas.size() - 1));
      }
      join = String.join(" ", res);
    }
    set.add("#" + document.getId() + " " + category + " "
        + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generating train and test sets.");
  saveSets(train, test, new LinkedHashSet<>(set));
}
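A minimal sketch of the label construction above, isolated for clarity; the category value is hypothetical:

String category = "Dünya Haberleri"; // hypothetical category value
String label = "__label__" + category.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
// label is now "__label__dünya_haberleri"; the "__label__" prefix is the
// convention fastText uses to mark class labels in training data.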
Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
The class WordHistogram, method generateHistograms:
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Histogram<String> roots = new Histogram<>(1000_000);
  Histogram<String> words = new Histogram<>(1000_000);
  int paragraphCounter = 0;
  int sentenceCounter = 0;
  int tokenCounter = 0;
  for (String paragraph : paragraphs) {
    List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
    sentenceCounter += sentences.size();
    for (String sentence : sentences) {
      List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
      tokenCounter += tokens.size();
      SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
      for (SentenceWordAnalysis e : analysis) {
        SingleAnalysis best = e.getBestAnalysis();
        // Skip numerals, punctuation, unknown words, and runtime-generated
        // analyses whose input contains digits.
        if (best.getPos() == PrimaryPos.Numeral || best.getPos() == PrimaryPos.Punctuation) {
          continue;
        }
        if (best.isUnknown()) {
          continue;
        }
        if (best.isRuntime()
            && !Strings.containsNone(e.getWordAnalysis().getInput(), "0123456789")) {
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.size() == 0) {
          continue;
        }
        roots.add(best.getDictionaryItem().lemma);
        String w = e.getWordAnalysis().getInput();
        // Lowercase everything except proper nouns, which are capitalized.
        if (best.getDictionaryItem().secondaryPos != SecondaryPos.ProperNoun) {
          w = w.toLowerCase(Turkish.LOCALE);
        } else {
          w = Turkish.capitalize(w);
        }
        words.add(w);
      }
    }
    paragraphCounter++;
    if (paragraphCounter % 1000 == 0) {
      System.out.println(paragraphCounter + " of " + paragraphs.size());
    }
  }
  System.out.println("tokenCounter = " + tokenCounter);
  System.out.println("sentenceCounter = " + sentenceCounter);
  Files.createDirectories(outRoot);
  roots.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
  roots.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  words.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
  words.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  // Save a second set of word histograms with words seen fewer than 10 times removed.
  words.removeSmaller(10);
  words.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
  words.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
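A minimal driver sketch for this method; the input file and output directory below are placeholders, not paths from the project:

List<String> paragraphs = Files.readAllLines(Paths.get("corpus/paragraphs.txt")); // placeholder
WordHistogram.generateHistograms(paragraphs, Paths.get("histograms")); // placeholder output root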
Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
The class ClassificationConsole, method removeNonWords:
private String removeNonWords(String sentence) {
  List<Token> docTokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
  List<String> reduced = new ArrayList<>(docTokens.size());
  for (Token token : docTokens) {
    Type type = token.getType();
    if (type == Type.PercentNumeral
        || type == Type.Number
        || type == Type.Punctuation
        || type == Type.RomanNumeral
        || type == Type.Time
        || type == Type.UnknownWord
        || type == Type.Unknown) {
      // Tokens of these types are kept only if they carry a "__" marker.
      if (!token.getText().contains("__")) {
        continue;
      }
    }
    reduced.add(token.getText());
  }
  return String.join(" ", reduced);
}
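Note the nested check: tokens of the filtered types are still kept if their text contains "__", so fastText-style label markers survive the cleanup. An illustrative call, with a hypothetical sentence:

String cleaned = removeNonWords("Takım 3 - 0 kazandı !");
// The Number tokens "3" and "0" and the Punctuation tokens "-" and "!"
// are dropped; the likely result is "Takım kazandı".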
Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
The class TurkishTokenizationExample, method tokenIterator:
public static void tokenIterator() {
  System.out.println("Low level tokenization iterator using ANTLR Lexer.");
  String input = "İstanbul'a, merhaba!";
  System.out.println("Input = " + input);
  Iterator<Token> tokenIterator = tokenizer.getTokenIterator(input);
  while (tokenIterator.hasNext()) {
    Token token = tokenIterator.next();
    System.out.println(token);
  }
}
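The tokenizer field is defined elsewhere in TurkishTokenizationExample; judging from the other snippets on this page, a likely definition (an assumption, not verified against the class) is:

static TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;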
Use of zemberek.tokenization.Token in project zemberek-nlp by ahmetaa.
The class TurkishSentenceNormalizer, method normalize:
public String normalize(String sentence) {
  if (sentence.trim().length() == 0) {
    return sentence;
  }
  String processed = preProcess(sentence);
  List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(processed);
  List<Candidates> candidatesList = new ArrayList<>();
  for (int i = 0; i < tokens.size(); i++) {
    Token currentToken = tokens.get(i);
    String current = currentToken.getText();
    String next = i == tokens.size() - 1 ? null : tokens.get(i + 1).getText();
    String previous = i == 0 ? null : tokens.get(i - 1).getText();
    LinkedHashSet<String> candidates = new LinkedHashSet<>(2);
    // Add matches from the manual lookup.
    candidates.addAll(lookupManual.get(current));
    // Add matches from the random walk.
    candidates.addAll(lookupFromGraph.get(current));
    // Add matches from ASCII equivalents.
    // TODO: this may decrease accuracy. Also, this can be eliminated with an
    // ASCII-tolerant analyzer.
    candidates.addAll(lookupFromAscii.get(current));
    // Add matches from informal-analysis-to-formal-surface conversion.
    WordAnalysis analyses = informalAsciiTolerantMorphology.analyze(current);
    for (SingleAnalysis analysis : analyses) {
      if (analysis.containsInformalMorpheme()) {
        WordGenerator.Result result = analysisConverter.convert(current, analysis);
        if (result != null) {
          candidates.add(result.surface);
        }
      } else {
        List<WordGenerator.Result> results = morphology.getWordGenerator()
            .generate(analysis.getDictionaryItem(), analysis.getMorphemes());
        for (Result result : results) {
          candidates.add(result.surface);
        }
      }
    }
    // If the word could not be analyzed, add the top 3 spell checker
    // suggestions (single edit distance matches).
    if (analyses.analysisCount() == 0 && current.length() > 3) {
      List<String> spellCandidates = spellChecker.suggestForWord(current, previous, next, lm);
      if (spellCandidates.size() > 3) {
        spellCandidates = new ArrayList<>(spellCandidates.subList(0, 3));
      }
      candidates.addAll(spellCandidates);
    }
    // If there is still no candidate, or the word is already analyzable as is,
    // add the word itself.
    if (candidates.isEmpty() || morphology.analyze(current).isCorrect()) {
      candidates.add(current);
    }
    Candidates result = new Candidates(
        currentToken.getText(),
        candidates.stream().map(Candidate::new).collect(Collectors.toList()));
    candidatesList.add(result);
  }
  // Apply Viterbi decoding over the candidate lattice and join the result.
  return String.join(" ", decode(candidatesList));
}
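A usage sketch for this class; the constructor signature follows zemberek's normalization examples, and the data paths are placeholders:

TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
// Placeholder paths to the normalization lookup data and the compressed language model.
Path lookupRoot = Paths.get("zemberek-data/normalization");
Path lmPath = Paths.get("zemberek-data/lm/lm.2gram.slm");
TurkishSentenceNormalizer normalizer =
    new TurkishSentenceNormalizer(morphology, lookupRoot, lmPath);
System.out.println(normalizer.normalize("Yrn okua gidicem")); // likely "yarın okula gideceğim"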