use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class FindPOS method test.
/**
 * Prints the given sentence, runs analysis followed by disambiguation on it,
 * and logs the part-of-speech information of the first parse of every entry.
 *
 * @param s the sentence to analyze
 */
private void test(String s) {
  System.out.println("Sentence = " + s);

  SentenceAnalysis sentenceAnalysis = analyzer.analyze(s);
  // Disambiguation works on the analysis object itself; its return value (if any) is not used.
  analyzer.disambiguate(sentenceAnalysis);

  for (SentenceAnalysis.Entry wordEntry : sentenceAnalysis) {
    // Only the first parse is reported — presumably the disambiguator's top choice (TODO confirm).
    // NOTE(review): get(0) assumes every entry has at least one parse.
    WordAnalysis firstParse = wordEntry.parses.get(0);
    Log.info(
        "%s -> %s : %s ",
        wordEntry.input,
        firstParse.dictionaryItem.primaryPos,
        firstParse.dictionaryItem.secondaryPos);
  }
}
use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class TurkishSentenceAnalyzerTest method doParseSentencesInCorpus.
/**
 * Analyzes every sentence of the given corpus file, collects the surface forms of
 * all parses whose dictionary item is {@code DictionaryItem.UNKNOWN}, writes them
 * with their counts to {@code unknown.txt}, and prints simple throughput figures.
 *
 * @param ntvmsnbcCorpus corpus file, read as a list of trimmed UTF-8 strings
 * @throws IOException if the corpus cannot be read or the report cannot be written
 */
private void doParseSentencesInCorpus(File ntvmsnbcCorpus) throws IOException {
  List<String> corpusLines = SimpleTextReader.trimmingUTF8Reader(ntvmsnbcCorpus).asStringList();
  Stopwatch timer = Stopwatch.createStarted();
  long totalWords = 0;
  int processed = 0;
  Histogram<String> unknownForms = new Histogram<>();

  for (String line : corpusLines) {
    SentenceAnalysis analysis = parser.analyze(line);

    // Record the surface form of every parse that could not be matched to a dictionary item.
    for (SentenceAnalysis.Entry wordEntry : analysis) {
      for (WordAnalysis candidate : wordEntry.parses) {
        if (candidate.dictionaryItem != DictionaryItem.UNKNOWN) {
          continue;
        }
        unknownForms.add(candidate.getSurfaceForm());
      }
    }

    totalWords += analysis.size();
    // The disambiguation step (parser.disambiguate(parse)) is commented out in this
    // method, so only the analysis step is executed and timed.
    processed++;

    // Progress report every 10000 sentences: sentence count and elapsed seconds.
    if (processed % 10000 != 0) {
      continue;
    }
    System.out.println(processed);
    System.out.println(timer.elapsed(TimeUnit.MILLISECONDS) / 1000d);
  }

  // Dump the unknown surface forms in the histogram's sorted order, one "form count" per line.
  try (PrintWriter report = new PrintWriter("unknown.txt", "utf-8")) {
    for (String form : unknownForms.getSortedList()) {
      report.println(form + " " + unknownForms.getCount(form));
    }
  }

  System.out.println("Word count = " + totalWords);
  System.out.println("Elapsed Time =" + timer.elapsed(TimeUnit.MILLISECONDS));
  // NOTE(review): the label mentions disambiguation although that step is disabled above.
  System.out.println("Parse and disambiguate per second = " + (totalWords * 1000d) / (timer.elapsed(TimeUnit.MILLISECONDS)));
}
use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method testSentenceAnalysis.
/**
 * Demonstration (ignored as a test): analyzes a fixed sentence and writes the
 * parse results once before and once after disambiguation.
 *
 * @throws IOException declared by the signature; propagated from the calls below
 */
@Test
@Ignore("Not a Test.")
public void testSentenceAnalysis() throws IOException {
  // Arguments are evaluated left to right, so the morphology is still created
  // before the disambiguator.
  TurkishSentenceAnalyzer sentenceAnalyzer = new TurkishSentenceAnalyzer(
      TurkishMorphology.createWithDefaults(),
      new Z3MarkovModelDisambiguator());

  String input = "Kırmızı kalemi al.";
  Log.info("Sentence = " + input);

  SentenceAnalysis result = sentenceAnalyzer.analyze(input);

  Log.info("Before disambiguation.");
  writeParseResult(result);

  Log.info("\nAfter disambiguation.");
  // The same analysis object is passed to disambiguate and then written again.
  sentenceAnalyzer.disambiguate(result);
  writeParseResult(result);
}
use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class GenerateDataWithRules method extractHighlyAmbigiousWordSentences.
/**
 * Collects words whose analyses contain many distinct stems and writes them to
 * {@code <outRoot>/<inputRoot name>-amb.txt}.
 *
 * <p>Every regular file directly under {@code inputRoot} is processed: its sentences
 * are analyzed and disambiguated, and for each word the distinct stems of its
 * analyses are counted. As soon as a word has more than {@code minCount} distinct
 * stems, its {@code WordAnalysis} is added to a histogram. After each file, processing
 * stops if the histogram holds more than {@code wordCount} entries.
 *
 * @param inputRoot directory whose immediate regular files are read
 * @param outRoot   directory in which the {@code -amb.txt} report is created
 * @param minCount  a word is recorded once its distinct stem count exceeds this value
 * @param wordCount no further files are processed once the histogram size exceeds this value
 * @throws IOException if the directory listing, sentence loading or report writing fails
 */
private void extractHighlyAmbigiousWordSentences(Path inputRoot, Path outRoot, int minCount, int wordCount) throws IOException {
  // Files.walk returns a stream backed by open directory handles; it must be closed,
  // so it is consumed inside try-with-resources. The fully qualified Stream name
  // avoids requiring an additional import in this file.
  List<Path> files;
  try (java.util.stream.Stream<Path> walked = Files.walk(inputRoot, 1)) {
    files = walked.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
  }

  Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
  for (Path file : files) {
    Log.info("Processing %s", file);
    LinkedHashSet<String> sentences = getSentences(file);
    // Sentences are handled in groups; 5000 is presumably the group size (see group()).
    List<List<String>> group = group(new ArrayList<>(sentences), 5000);
    for (List<String> lines : group) {
      Log.info("Collected %d words.", wordAnalyses.size());
      LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
      for (String sentence : toProcess) {
        try {
          SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
          for (SentenceWordAnalysis analysis : sentenceAnalysis) {
            HashSet<String> stems = new HashSet<>(4);
            for (SingleAnalysis s : analysis.getWordAnalysis()) {
              stems.add(s.getStem());
              // Enough distinct stems seen: record the word once and move on to the next word.
              if (stems.size() > minCount) {
                wordAnalyses.add(analysis.getWordAnalysis());
                break;
              }
            }
          }
        } catch (Exception e) {
          // Best effort: a sentence that fails is reported and skipped.
          Log.warn("Error in sentence %s", sentence);
        }
      }
    }
    // The limit is checked only after a whole file has been processed.
    if (wordAnalyses.size() > wordCount) {
      break;
    }
  }

  String s = inputRoot.toFile().getName();
  Path amb = outRoot.resolve(s + "-amb.txt");
  // One record per word: the input, then every analysis in long format, then a blank line.
  try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
    for (WordAnalysis wa : wordAnalyses.getSortedList()) {
      pwa.println(wa.getInput());
      for (SingleAnalysis analysis : wa) {
        pwa.println(analysis.formatLong());
      }
      pwa.println();
    }
  }
}
use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method disambiguationExample.
/**
 * Demonstration (ignored as a test): prints every analysis of each word in a fixed
 * sentence, then disambiguates the sentence and prints the analyses returned by
 * {@code bestAnalysis()}.
 *
 * @throws IOException declared by the signature; propagated from the calls below
 */
@Test
@Ignore("Not a Test")
public void disambiguationExample() throws IOException {
  TurkishMorphology turkishMorphology = TurkishMorphology.createWithDefaults();
  String input = "Yarın kar yağacak.";
  System.out.println("Sentence = " + input);

  List<WordAnalysis> wordAnalyses = turkishMorphology.analyzeSentence(input);

  System.out.println("Before disambiguation.");
  wordAnalyses.forEach(word -> {
    System.out.println("Word = " + word.getInput());
    // A WordAnalysis iterates over its individual analyses.
    word.forEach(candidate -> System.out.println(candidate.formatLong()));
  });

  System.out.println("\nAfter disambiguation.");
  SentenceAnalysis disambiguated = turkishMorphology.disambiguate(input, wordAnalyses);
  disambiguated.bestAnalysis().forEach(best -> System.out.println(best.formatLong()));
}
Aggregations