use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class TurkishSentenceAnalyzerTest method doParseSentencesInCorpus.
/**
 * Analyzes every entry of the given corpus file, records the surface form of each
 * analysis whose dictionary item is {@code DictionaryItem.UNKNOWN}, writes those
 * surface forms with their counts to "unknown.txt" and prints throughput figures.
 *
 * <p>Each string returned by the reader's {@code asStringList()} is treated as one
 * sentence. Progress (sentence count and elapsed seconds) is printed every 10000
 * sentences.
 *
 * @param ntvmsnbcCorpus corpus file to analyze
 * @throws IOException if reading the corpus or writing "unknown.txt" fails
 */
private void doParseSentencesInCorpus(File ntvmsnbcCorpus) throws IOException {
    List<String> sentences = SimpleTextReader.trimmingUTF8Reader(ntvmsnbcCorpus).asStringList();
    Stopwatch sw = Stopwatch.createStarted();
    long wc = 0;
    int s = 0;
    Histogram<String> unknownStuff = new Histogram<>();
    for (String sentence : sentences) {
        SentenceAnalysis parse = parser.analyze(sentence);
        for (SentenceAnalysis.Entry entry : parse) {
            List<WordAnalysis> parses = entry.parses;
            for (WordAnalysis wordAnalysis : parses) {
                if (wordAnalysis.dictionaryItem == DictionaryItem.UNKNOWN) {
                    unknownStuff.add(wordAnalysis.getSurfaceForm());
                }
            }
        }
        wc += parse.size();
        // parser.disambiguate(parse);
        s++;
        if (s % 10000 == 0) {
            System.out.println(s);
            System.out.println(sw.elapsed(TimeUnit.MILLISECONDS) / 1000d);
        }
    }

    // Read the stopwatch exactly once, directly after the analysis loop, so that the
    // reported figures cover analysis only (not the report written below) and both
    // printed values are derived from the same reading.
    long elapsedMillis = sw.elapsed(TimeUnit.MILLISECONDS);

    try (PrintWriter pw = new PrintWriter("unknown.txt", "utf-8")) {
        for (String s1 : unknownStuff.getSortedList()) {
            pw.println(s1 + " " + unknownStuff.getCount(s1));
        }
    }
    System.out.println("Word count = " + wc);
    System.out.println("Elapsed Time =" + elapsedMillis);
    System.out.println("Parse and disambiguate per second = " + (wc * 1000d) / elapsedMillis);
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class UnidentifiedTokenAnalyzerTest method shouldCreateUnidentifiedTokenParserSuccessfully.
/**
 * Builds an {@code UnidentifiedTokenAnalyzer} on top of a default
 * {@code TurkishMorphology} and prints every analysis returned for "Ankara'ya".
 * There are no assertions; the test passes as long as nothing throws.
 */
@Test
public void shouldCreateUnidentifiedTokenParserSuccessfully() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    UnidentifiedTokenAnalyzer unidentifiedAnalyzer = new UnidentifiedTokenAnalyzer(morphology);
    unidentifiedAnalyzer.analyze("Ankara'ya").forEach(System.out::println);
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class ZemberekNlpScripts method testWordAnalysis.
/**
 * Manual script (ignored as a test): analyzes the word "phpye" with a default
 * {@code TurkishMorphology} and logs, for each analysis, its long format, its
 * stems and its lemmas.
 */
@Test
@Ignore("Not a Test.")
public void testWordAnalysis() throws IOException {
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
    for (WordAnalysis analysis : analyzer.analyze("phpye")) {
        String longForm = analysis.formatLong();
        Log.info(longForm);
        String stemsLine = "\tStems = " + analysis.getStems();
        Log.info(stemsLine);
        String lemmasLine = "\tLemmas = " + analysis.getLemmas();
        Log.info(lemmasLine);
    }
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class TurkishMorphology method analyzeWordsWithApostrophe.
/**
 * Analyzes a word that contains an apostrophe by analyzing it with all apostrophes
 * removed and keeping only the analyses whose stems contain the normalized text
 * that precedes the first apostrophe.
 *
 * @param word the token to analyze
 * @return the matching analyses; an empty list when the word has no apostrophe,
 *     when the first apostrophe is the first or last character, or when the
 *     apostrophe-free form yields no analysis
 */
private List<WordAnalysis> analyzeWordsWithApostrophe(String word) {
    final int apostropheAt = word.indexOf('\'');

    // No apostrophe at all, or nothing on one side of it: nothing to do.
    if (apostropheAt < 0) {
        return Collections.emptyList();
    }
    if (apostropheAt == 0 || apostropheAt == word.length() - 1) {
        return Collections.emptyList();
    }

    StemAndEnding parts = new StemAndEnding(
        word.substring(0, apostropheAt),
        word.substring(apostropheAt + 1));
    String expectedStem = TurkishAlphabet.INSTANCE.normalize(parts.stem);

    String apostropheFree = word.replaceAll("'", "");
    List<WordAnalysis> candidates = wordAnalyzer.analyze(apostropheFree);
    if (candidates.isEmpty()) {
        return Collections.emptyList();
    }

    return candidates.stream()
        .filter(candidate -> candidate.getStems().contains(expectedStem))
        .collect(Collectors.toList());
}
use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.
the class UnidentifiedTokenAnalyzer method analyze.
/**
 * Tries to analyze a token in one of three ways, checked in order:
 * <ol>
 *   <li>tokens containing '?' (and empty tokens) yield an empty list;</li>
 *   <li>tokens containing any ASCII digit are delegated to {@code parseNumeral};</li>
 *   <li>tokens with an apostrophe that is neither the first nor the last character,
 *       or tokens starting with an upper-case character, are analyzed after a
 *       temporary proper-noun dictionary item is registered in the graph.</li>
 * </ol>
 * Anything else yields an empty list.
 *
 * <p>The method is {@code synchronized}; it temporarily adds an item to {@code graph}
 * and removes it again before returning.
 *
 * @param word the token to analyze; must not be {@code null}
 * @return the analyses found, possibly empty
 */
public synchronized List<WordAnalysis> analyze(String word) {
    // An empty token has no first character to inspect further down; treat it as
    // "nothing to analyze" instead of failing on charAt(0).
    if (word.isEmpty() || word.contains("?")) {
        return Collections.emptyList();
    }
    if (!Strings.containsNone(word, "0123456789")) {
        return parseNumeral(word);
    }
    int index = word.indexOf('\'');
    if (index >= 0) {
        if (index == 0 || index == word.length() - 1) {
            return Collections.emptyList();
        }
        StemAndEnding se = new StemAndEnding(word.substring(0, index), word.substring(index + 1));
        String stem = TurkishAlphabet.INSTANCE.normalize(se.stem);
        String ending = TurkishAlphabet.INSTANCE.normalize(se.ending);
        return analyzeWithRuntimeProperNoun(stem, stem + ending);
    }
    if (Character.isUpperCase(word.charAt(0))) {
        String normalized = TurkishAlphabet.INSTANCE.normalize(word);
        return analyzeWithRuntimeProperNoun(normalized, normalized);
    }
    return Collections.emptyList();
}

/**
 * Registers a temporary proper-noun dictionary item built from {@code root}, analyzes
 * {@code toParse} while it is registered, and always removes the item afterwards.
 *
 * @param root normalized form used as the root of the temporary item
 * @param toParse the string handed to the parser
 * @return whatever {@code parser.analyze(toParse)} returns
 */
private List<WordAnalysis> analyzeWithRuntimeProperNoun(String root, String toParse) {
    String pronunciation = guessPronunciation(root);
    DictionaryItem itemProp = new DictionaryItem(
        Turkish.capitalize(root), root, pronunciation, PrimaryPos.Noun, SecondaryPos.ProperNoun);
    itemProp.attributes.add(RootAttribute.Runtime);
    graph.addDictionaryItem(itemProp);
    try {
        return parser.analyze(toParse);
    } finally {
        // Remove the temporary item even if analysis throws, so it cannot linger in the graph.
        graph.removeDictionaryItem(itemProp);
    }
}
Aggregations