Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method disambiguationExample: analyzes a sentence, prints every analysis of each word, then prints the single best analysis per word after disambiguation.
@Test
@Ignore("Not a Test")
public void disambiguationExample() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  String sentence = "Yarın kar yağacak.";
  System.out.println("Sentence = " + sentence);
  List<WordAnalysis> analysis = morphology.analyzeSentence(sentence);
  System.out.println("Before disambiguation.");
  for (WordAnalysis entry : analysis) {
    System.out.println("Word = " + entry.getInput());
    for (SingleAnalysis single : entry) {
      System.out.println(single.formatLong());
    }
  }
  System.out.println("\nAfter disambiguation.");
  SentenceAnalysis after = morphology.disambiguate(sentence, analysis);
  after.bestAnalysis().forEach(s -> System.out.println(s.formatLong()));
}
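If only the disambiguated result is needed, analyzeAndDisambiguate combines both steps in one call; the same API appears in generateSets below. A minimal sketch, reusing the morphology instance from the example:

// Minimal sketch: one-call analysis plus disambiguation.
SentenceAnalysis result = morphology.analyzeAndDisambiguate("Yarın kar yağacak.");
for (SentenceWordAnalysis sw : result) {
  System.out.println(sw.getBestAnalysis().formatLong());
}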
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method failedWordTestIssue124: reads a list of previously failing words, removes the ones the analyzer now accepts, and writes the remainder to fails-reduced.txt.
@Test
@Ignore("Not a Test.")
public void failedWordTestIssue124() throws IOException {
  Path failPath = DATA_PATH.resolve("fails.txt");
  LinkedHashSet<String> words =
      new LinkedHashSet<>(Files.readAllLines(failPath, StandardCharsets.UTF_8));
  LinkedHashSet<String> accepted = new LinkedHashSet<>();
  TurkishMorphology parser = TurkishMorphology.createWithDefaults();
  for (String s : words) {
    WordAnalysis parses = parser.analyze(s);
    List<SingleAnalysis> analyses = parses.getAnalysisResults();
    for (SingleAnalysis parse : analyses) {
      if (parse.isUnknown() || parse.isRuntime()) {
        continue;
      }
      // One dictionary-backed analysis is enough to accept the word.
      accepted.add(s);
      break;
    }
  }
  int initialCount = words.size();
  for (String s : accepted) {
    words.remove(s);
  }
  Path failReduced = DATA_PATH.resolve("fails-reduced.txt");
  try (PrintWriter pw = new PrintWriter(failReduced.toFile(), "utf-8")) {
    words.forEach(pw::println);
  }
  // After removal, `words` holds only the still-failing entries, so the
  // original count must be taken before the removal loop.
  Log.info("Word count = %d Found = %d Not Found = %d",
      initialCount, accepted.size(), words.size());
}
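The accept/reject check can be factored into a small predicate. A sketch built only from the calls used above; the helper name isRecognized is our own:

// Sketch: true if the word has at least one analysis that is neither
// unknown nor produced by the runtime (unidentified-token) analyzer.
static boolean isRecognized(TurkishMorphology morphology, String word) {
  for (SingleAnalysis a : morphology.analyze(word).getAnalysisResults()) {
    if (!a.isUnknown() && !a.isRuntime()) {
      return true;
    }
  }
  return false;
}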
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class CategoryPredictionExperiment, method generateSets: converts a labeled web corpus into __label__-prefixed classification training and test sets, optionally using only document titles and/or lemmas.
private void generateSets(Path input, Path train, Path test,
    boolean useOnlyTitle, boolean useLemmas) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
  // Count category frequencies so that rare labels can be dropped.
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  categoryCounts.removeSmaller(20);
  for (String c : categoryCounts.getSortedList()) {
    System.out.println(c + " " + categoryCounts.getCount(c));
  }
  Log.info("Reduced label count = %d", categoryCounts.size());
  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (useOnlyTitle && document.getTitle().length() == 0) {
      continue;
    }
    String content = document.getContentAsString();
    String title = document.getTitle();
    List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
    List<String> reduced = new ArrayList<>(docTokens.size());
    String category = document.getCategory();
    if (categoryCounts.contains(category)) {
      // fastText-style label prefix; spaces in category names become underscores.
      category = "__label__"
          + document.getCategory().replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }
    for (Token token : docTokens) {
      // Skip numerals, punctuation, time expressions and unknown tokens.
      if (token.getType() == Token.Type.PercentNumeral
          || token.getType() == Token.Type.Number
          || token.getType() == Token.Type.Punctuation
          || token.getType() == Token.Type.RomanNumeral
          || token.getType() == Token.Type.Time
          || token.getType() == Token.Type.UnknownWord
          || token.getType() == Token.Type.Unknown) {
        continue;
      }
      String tokenStr = token.getText();
      reduced.add(tokenStr);
    }
    String join = String.join(" ", reduced);
    if (join.trim().isEmpty()) {
      continue;
    }
    if (useLemmas) {
      // Replace each word with the last lemma of its best analysis.
      SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(join);
      List<String> res = new ArrayList<>();
      for (SentenceWordAnalysis e : analysis) {
        SingleAnalysis best = e.getBestAnalysis();
        if (best.isUnknown()) {
          res.add(e.getWordAnalysis().getInput());
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.size() == 0) {
          continue;
        }
        res.add(lemmas.get(lemmas.size() - 1));
      }
      join = String.join(" ", res);
    }
    set.add("#" + document.getId() + " " + category + " "
        + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generate train and test set.");
  saveSets(train, test, new LinkedHashSet<>(set));
}
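The generated lines follow the __label__ convention used by fastText-style classifiers. The long token-type condition can also be written as an EnumSet lookup (java.util.EnumSet); a sketch under the same assumptions, where the EXCLUDED constant is our own:

// Sketch: token types to drop before building a training line.
static final Set<Token.Type> EXCLUDED = EnumSet.of(
    Token.Type.PercentNumeral, Token.Type.Number, Token.Type.Punctuation,
    Token.Type.RomanNumeral, Token.Type.Time,
    Token.Type.UnknownWord, Token.Type.Unknown);

// Inside the token loop:
if (EXCLUDED.contains(token.getType())) {
  continue;
}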
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class ZemberekNlpScripts, method readmeExample1: prints all morphological analyses of the word "kalemin".
@Test
@Ignore("Not a Test")
public void readmeExample1() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  WordAnalysis results = morphology.analyze("kalemin");
  results.forEach(s -> System.out.println(s.formatLong()));
}
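To get lemmas rather than full analysis strings, each SingleAnalysis exposes getLemmas(), as also used in generateSets above. A minimal sketch:

// Prints the candidate lemma list for each analysis of "kalemin".
for (SingleAnalysis s : morphology.analyze("kalemin")) {
  System.out.println(s.getLemmas());
}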
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class CorpusNerCollector, method main: runs a perceptron NER model over corpus sentences and collects clean sentences containing one or two named entities as training data.
public static void main(String[] args) throws IOException {
  Path corporaRoot = Paths.get("/media/ahmetaa/depo/corpora");
  Path corpusDirList = corporaRoot.resolve("ner-list");
  Path outRoot = Paths.get("/media/ahmetaa/depo/ner/out");
  Files.createDirectories(outRoot);
  BlockTextLoader corpusProvider =
      BlockTextLoader.fromDirectoryRoot(corporaRoot, corpusDirList, 10_000);
  // Assumes you generated a model in the my-model directory.
  Path modelRoot = Paths.get("my-model");
  TurkishMorphology morphology = TurkishMorphology.builder()
      .setLexicon(RootLexicon.getDefault())
      .disableUnidentifiedTokenAnalyzer()
      .build();
  PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);
  Set<String> illegal = Sets.newHashSet(".", ",", "!", "?", ":");
  List<String> lines = new ArrayList<>();
  int c = 0;
  int k = 0;
  for (TextChunk chunk : corpusProvider) {
    LinkedHashSet<String> sentences =
        new LinkedHashSet<>(TextCleaner.cleanAndExtractSentences(chunk.getData()));
    for (String sentence : sentences) {
      if (sentence.length() > 100) {
        continue;
      }
      NerSentence result = ner.findNamedEntities(sentence);
      int neCount = result.getNamedEntities().size();
      List<NamedEntity> nes = result.getNamedEntities();
      // Reject sentences whose entities contain punctuation or words that
      // do not analyze as proper nouns or abbreviations.
      boolean badNamedEntity = false;
      for (NamedEntity ne : nes) {
        for (NerToken token : ne.tokens) {
          if (illegal.contains(token.word)) {
            badNamedEntity = true;
            break;
          }
          WordAnalysis a = morphology.analyze(token.word);
          for (SingleAnalysis analysis : a) {
            DictionaryItem item = analysis.getDictionaryItem();
            if (item.secondaryPos != SecondaryPos.Abbreviation
                && item.secondaryPos != SecondaryPos.ProperNoun) {
              badNamedEntity = true;
              break;
            }
          }
          if (badNamedEntity) {
            break;
          }
        }
        if (badNamedEntity) {
          break;
        }
      }
      if (badNamedEntity) {
        continue;
      }
      // Keep sentences with one or two named entities.
      if (neCount > 0 && neCount < 3) {
        lines.add(result.getAsTrainingSentence(AnnotationStyle.BRACKET));
        c++;
        if (c == 1000) {
          // Write batches of 1000 sentences; stop after 11 output files.
          Path out = outRoot.resolve(chunk.id + "-" + k);
          Files.write(out, lines);
          Log.info("%s created. ", out);
          lines = new ArrayList<>();
          c = 0;
          k++;
          if (k > 10) {
            System.exit(0);
          }
        }
      }
    }
  }
}
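The proper-noun check on entity tokens can be extracted into a predicate. A sketch using only the calls from the method above; the helper name looksLikeProperNoun is our own:

// Sketch, mirroring the filter above: every analysis of the word must
// resolve to a dictionary item whose secondary POS is ProperNoun or
// Abbreviation.
static boolean looksLikeProperNoun(TurkishMorphology morphology, String word) {
  for (SingleAnalysis analysis : morphology.analyze(word)) {
    DictionaryItem item = analysis.getDictionaryItem();
    if (item.secondaryPos != SecondaryPos.Abbreviation
        && item.secondaryPos != SecondaryPos.ProperNoun) {
      return false;
    }
  }
  return true;
}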