Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class CorpusNerCollector, method main.
public static void main(String[] args) throws IOException {
  Path corporaRoot = Paths.get("/media/ahmetaa/depo/corpora");
  Path corpusDirList = corporaRoot.resolve("ner-list");
  Path outRoot = Paths.get("/media/ahmetaa/depo/ner/out");
  Files.createDirectories(outRoot);
  BlockTextLoader corpusProvider =
      BlockTextLoader.fromDirectoryRoot(corporaRoot, corpusDirList, 10_000);
  // assumes you generated a model in my-model directory.
  Path modelRoot = Paths.get("my-model");
  TurkishMorphology morphology = TurkishMorphology.builder()
      .setLexicon(RootLexicon.getDefault())
      .disableUnidentifiedTokenAnalyzer()
      .build();
  PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);
  Set<String> illegal = Sets.newHashSet(".", ",", "!", "?", ":");
  List<String> lines = new ArrayList<>();
  int c = 0;
  int k = 0;
  for (TextChunk chunk : corpusProvider) {
    LinkedHashSet<String> sentences =
        new LinkedHashSet<>(TextCleaner.cleanAndExtractSentences(chunk.getData()));
    for (String sentence : sentences) {
      if (sentence.length() > 100) {
        continue;
      }
      NerSentence result = ner.findNamedEntities(sentence);
      int neCount = result.getNamedEntities().size();
      List<NamedEntity> nes = result.getNamedEntities();
      boolean badNamedEntity = false;
      for (NamedEntity ne : nes) {
        for (NerToken token : ne.tokens) {
          if (illegal.contains(token.word)) {
            badNamedEntity = true;
            break;
          }
          WordAnalysis a = morphology.analyze(token.word);
          for (SingleAnalysis analysis : a) {
            DictionaryItem item = analysis.getDictionaryItem();
            if (item.secondaryPos != SecondaryPos.Abbreviation
                && item.secondaryPos != SecondaryPos.ProperNoun) {
              badNamedEntity = true;
              break;
            }
          }
        }
        if (badNamedEntity) {
          break;
        }
      }
      if (badNamedEntity) {
        continue;
      }
      if (neCount > 0 && neCount < 3) {
        lines.add(result.getAsTrainingSentence(AnnotationStyle.BRACKET));
        c++;
        if (c == 1000) {
          Path out = outRoot.resolve(chunk.id + "-" + k);
          Files.write(out, lines);
          Log.info("%s created. ", out);
          lines = new ArrayList<>();
          c = 0;
          k++;
          if (k > 10) {
            System.exit(0);
          }
        }
      }
    }
  }
}
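A minimal standalone sketch of the SingleAnalysis check used above, reduced to a single word; the input word "Ankara" and the variable names are only illustrative and are not part of the collector:
TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
WordAnalysis a = morphology.analyze("Ankara"); // hypothetical input word
boolean onlyProperOrAbbreviation = true;
for (SingleAnalysis analysis : a) {
  DictionaryItem item = analysis.getDictionaryItem();
  if (item.secondaryPos != SecondaryPos.Abbreviation
      && item.secondaryPos != SecondaryPos.ProperNoun) {
    // the word has a reading that is neither a proper noun nor an abbreviation
    onlyProperOrAbbreviation = false;
  }
}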
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class HunspellOperations, method generateAnnotationFileSingleSplit.
private static void generateAnnotationFileSingleSplit(Path vocab) throws IOException {
  List<String> words = Files.readAllLines(vocab, StandardCharsets.UTF_8);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> annotations = new ArrayList<>();
  for (String word : words) {
    WordAnalysis analysis = morphology.analyze(word);
    if (!analysis.isCorrect()) {
      Log.warn("Cannot analyze %s", word);
      continue;
    }
    LinkedHashSet<String> stemEndings = new LinkedHashSet<>();
    for (SingleAnalysis s : analysis) {
      if (s.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun
          || s.getDictionaryItem().secondaryPos == SecondaryPos.Abbreviation) {
        continue;
      }
      List<String> stems = s.getStems();
      for (String stem : stems) {
        String ending = word.substring(stem.length());
        if (!(stem + ending).equals(word)) {
          Log.warn("Stem + Ending %s+%s does not match word %s", stem, ending, word);
          continue;
        }
        if (ending.length() > 0) {
          stemEndings.add(word + " " + stem + " " + ending);
        } else {
          stemEndings.add(word + " " + stem);
        }
      }
    }
    annotations.add(String.join(",", stemEndings));
  }
  Files.write(Paths.get("data/vocabulary/annonations.txt"), annotations, StandardCharsets.UTF_8);
}
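A reduced sketch of the stem + ending split above for a single word; the input word "evlerden" is a hypothetical example:
TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
String word = "evlerden"; // hypothetical input word
WordAnalysis analysis = morphology.analyze(word);
for (SingleAnalysis s : analysis) {
  for (String stem : s.getStems()) {
    // surface ending implied by this stem
    String ending = word.substring(stem.length());
    System.out.println(word + " -> " + stem + (ending.isEmpty() ? "" : " + " + ending));
  }
}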
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class DataConverter, method extract.
private static void extract(Path dataPath, Path output) throws IOException {
  DataSet set = com.google.common.io.Files
      .asCharSource(dataPath.toFile(), Charsets.UTF_8)
      .readLines(new DataSetLoader());
  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict",
          "tr/person-names.dict").build());
  List<SentenceAnalysis> result = new ArrayList<>();
  Histogram<String> parseFails = new Histogram<>();
  for (SentenceData sentenceData : set) {
    // System.out.println(sentenceData.correctParse);
    List<String> tokens = Splitter.on(" ").splitToList(sentenceData.sentence());
    if (tokens.size() == 0 || tokens.size() != sentenceData.correctParse.size()) {
      continue;
    }
    List<SentenceWordAnalysis> correctList = new ArrayList<>();
    for (int i = 0; i < tokens.size(); i++) {
      String s = tokens.get(i);
      String p = sentenceData.correctParse.get(i);
      p = p.replaceAll("PCNom", "PCNOM");
      p = p.replaceAll("Pnon|Nom", "");
      p = p.replaceAll("\\+Pos\\+", "+");
      p = p.replaceAll("\\+Pos\\^DB", "^DB");
      p = p.replaceAll("[+]+", "+");
      p = p.replaceAll("[+]$", "");
      p = p.replaceAll("[+]\\^DB", "^DB");
      p = p.replaceAll("[.]", "");
      p = p.toLowerCase(Turkish.LOCALE);
      p = p.replaceAll("adverb", "adv");
      p = p.replaceAll("\\+cop\\+a3sg", "+a3sg+cop");
      p = p.replaceAll("\\+Unable", "^DB+Verb+Able+Neg");
      if (lookup.containsKey(p)) {
        p = lookup.get(p);
      }
      WordAnalysis a = morphology.analyze(s);
      if (!a.isCorrect()) {
        break;
      }
      SingleAnalysis best = null;
      for (SingleAnalysis analysis : a) {
        String of = convert(analysis);
        if (of.equals(p)) {
          best = analysis;
          break;
        }
      }
      if (best == null) {
        if (Character.isUpperCase(s.charAt(0)) && (p.contains("+noun") && !p.contains("prop"))) {
          String pp = p.replaceFirst("\\+noun", "\\+noun+prop");
          for (SingleAnalysis analysis : a) {
            String of = convert(analysis);
            if (of.equals(pp)) {
              best = analysis;
              break;
            }
          }
        }
      }
      if (best == null) {
        List<String> z = a.getAnalysisResults().stream()
            .map(DataConverter::convert)
            .collect(Collectors.toList());
        parseFails.add(s + " " + p);
      } else {
        correctList.add(new SentenceWordAnalysis(best, a));
      }
    }
    if (correctList.size() == tokens.size()) {
      result.add(new SentenceAnalysis(sentenceData.sentence(), correctList));
    }
  }
  Scripts.saveUnambiguous(result, output);
  parseFails.removeSmaller(3);
  parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
  System.out.format("Full Sentence Match = %d in %d%n", result.size(), set.sentences.size());
}
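convert(analysis) above is DataConverter's own formatting helper and is not shown on this page. As a rough stand-in, formatLong() (used in the Scripts example below) also yields a printable analysis string that can be compared against a normalized gold parse in the same spirit; the token and the target string here are hypothetical:
TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
WordAnalysis a = morphology.analyze("evler"); // hypothetical token
String target = "..."; // a normalized gold parse string would go here
SingleAnalysis best = null;
for (SingleAnalysis analysis : a) {
  if (analysis.formatLong().equals(target)) { // formatLong() stands in for the project-specific convert()
    best = analysis;
    break;
  }
}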
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class WordHistogram, method generateHistograms.
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Histogram<String> roots = new Histogram<>(1000_000);
  Histogram<String> words = new Histogram<>(1000_000);
  int paragraphCounter = 0;
  int sentenceCounter = 0;
  int tokenCounter = 0;
  for (String paragraph : paragraphs) {
    List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
    sentenceCounter += sentences.size();
    for (String sentence : sentences) {
      List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
      tokenCounter += tokens.size();
      SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
      for (SentenceWordAnalysis e : analysis) {
        SingleAnalysis best = e.getBestAnalysis();
        if (best.getPos() == PrimaryPos.Numeral || best.getPos() == PrimaryPos.Punctuation) {
          continue;
        }
        if (best.isUnknown()) {
          continue;
        }
        if (best.isRuntime() && !Strings.containsNone(e.getWordAnalysis().getInput(), "01234567890")) {
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.size() == 0) {
          continue;
        }
        roots.add(best.getDictionaryItem().lemma);
        String w = e.getWordAnalysis().getInput();
        if (best.getDictionaryItem().secondaryPos != SecondaryPos.ProperNoun) {
          w = w.toLowerCase(Turkish.LOCALE);
        } else {
          w = Turkish.capitalize(w);
        }
        words.add(w);
      }
    }
    paragraphCounter++;
    if (paragraphCounter % 1000 == 0) {
      System.out.println(paragraphCounter + " of " + paragraphs.size());
    }
  }
  System.out.println("tokenCounter = " + tokenCounter);
  System.out.println("sentenceCounter = " + sentenceCounter);
  Files.createDirectories(outRoot);
  roots.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
  roots.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  words.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
  words.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
  words.removeSmaller(10);
  words.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
  words.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
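The disambiguation pattern above, in isolation; the sentence is only a placeholder:
TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
SentenceAnalysis analysis = morphology.analyzeAndDisambiguate("Kediler bahçede uyudu."); // hypothetical sentence
for (SentenceWordAnalysis e : analysis) {
  SingleAnalysis best = e.getBestAnalysis();
  // print each surface word with the lemma of its best analysis
  System.out.println(e.getWordAnalysis().getInput() + " -> " + best.getDictionaryItem().lemma);
}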
Use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.
The class Scripts, method saveUnambiguous.
public static void saveUnambiguous(List<String> sentences, TurkishMorphology morphology, Path out)
    throws IOException {
  try (PrintWriter pwMorph = new PrintWriter(out.toFile(), "utf-8")) {
    for (String sentence : sentences) {
      SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
      if (analysis.bestAnalysis().stream().anyMatch(SingleAnalysis::isUnknown)) {
        continue;
      }
      pwMorph.format("S:%s%n", sentence);
      for (SentenceWordAnalysis sw : analysis) {
        WordAnalysis wa = sw.getWordAnalysis();
        pwMorph.println(wa.getInput());
        SingleAnalysis best = sw.getBestAnalysis();
        for (SingleAnalysis singleAnalysis : wa) {
          boolean isBest = singleAnalysis.equals(best);
          if (wa.analysisCount() == 1) {
            pwMorph.println(singleAnalysis.formatLong());
          } else {
            pwMorph.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
          }
        }
      }
      pwMorph.println();
    }
  }
}
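A minimal call of this method; the sentences and the output path are placeholders:
TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
List<String> sentences = Arrays.asList("Dün İstanbul'a gittim.", "Bugün hava çok güzel."); // placeholder sentences
Scripts.saveUnambiguous(sentences, morphology, Paths.get("unambiguous.txt")); // hypothetical output path; caller must handle IOException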