Use of zemberek.morphology.analysis.tr.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class TurkishSentenceAnalyzerTest, method setUp.
/**
 * Creates the sentence analyzer used by the tests, wiring a default
 * {@code TurkishMorphology} instance to a {@code Z3MarkovModelDisambiguator}.
 */
@Before
public void setUp() throws Exception {
  TurkishMorphology defaultMorphology = TurkishMorphology.createWithDefaults();
  Z3MarkovModelDisambiguator disambiguator = new Z3MarkovModelDisambiguator();
  parser = new TurkishSentenceAnalyzer(defaultMorphology, disambiguator);
}
Use of zemberek.morphology.analysis.tr.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class UnidentifiedTokenAnalyzerTest, method shouldParseSmallCaseProperNounsWithSingleQuote.
/**
 * Verifies that {@code UnidentifiedTokenAnalyzer} produces the same proper-noun
 * analyses for apostrophe-suffixed words whether the input starts with an
 * upper-case or a lower-case letter, using the "dev-lexicon.txt" dictionary.
 */
@Test
public void shouldParseSmallCaseProperNounsWithSingleQuote() throws IOException {
  HashSet<String> expectedIstanbul = Sets.newHashSet(
      "[(İstanbul:istanbul) (Noun,Prop;A3sg+P2sg:un+Nom)]",
      "[(İstanbul:istanbul) (Noun,Prop;A3sg+Pnon+Gen:un)]");
  TurkishMorphology morphology = TurkishMorphology.builder()
      .addTextDictionaryResources("dev-lexicon.txt")
      .build();
  UnidentifiedTokenAnalyzer analyzer = new UnidentifiedTokenAnalyzer(morphology);

  // Both casings must yield exactly the two expected analyses.
  for (String input : new String[] {"İstanbul'un", "istanbul'un"}) {
    List<WordAnalysis> analyses = analyzer.analyze(input);
    Assert.assertEquals(2, analyses.size());
    for (WordAnalysis analysis : analyses) {
      Assert.assertTrue(expectedIstanbul.contains(analysis.formatLong()));
    }
  }

  // Both casings must yield the single dative analysis.
  for (String input : new String[] {"Ankara'ya", "ankara'ya"}) {
    List<WordAnalysis> analyses = analyzer.analyze(input);
    Assert.assertEquals(1, analyses.size());
    Assert.assertEquals(
        "[(Ankara:ankara) (Noun,Prop;A3sg+Pnon+Dat:ya)]", analyses.get(0).formatLong());
  }

  // Karaman does not exist in dictionary
  List<WordAnalysis> capitalized = analyzer.analyze("Karaman");
  Assert.assertEquals(1, capitalized.size());
  Assert.assertEquals(
      "[(Karaman:karaman) (Noun,Prop;A3sg+Pnon+Nom)]", capitalized.get(0).formatLong());

  List<WordAnalysis> lowerCaseWithSuffix = analyzer.analyze("karaman'a");
  Assert.assertEquals(1, lowerCaseWithSuffix.size());
  Assert.assertEquals(
      "[(Karaman:karaman) (Noun,Prop;A3sg+Pnon+Dat:a)]", lowerCaseWithSuffix.get(0).formatLong());

  // Lower-case input without an apostrophe is expected to produce no analysis.
  List<WordAnalysis> lowerCaseBare = analyzer.analyze("karaman");
  Assert.assertEquals(0, lowerCaseBare.size());
}
Use of zemberek.morphology.analysis.tr.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class ZemberekNlpScripts, method parseLargeVocabularyZemberekForMorfessor.
/**
 * Script (ignored as a test) that builds a Morfessor annotation file from a
 * word-frequency list.
 *
 * <p>Reads the histogram from {@code vocab.all.freq} under {@code DATA_PATH},
 * calls {@code removeSmaller(1000)} on it (presumably dropping low-count
 * entries; confirm against {@code Histogram}), analyzes every remaining word
 * of at least four characters, and writes lines of the form
 * {@code word seg1, seg2, ...} to {@code out/morfessor-annotation.txt} via
 * {@code sortAndSave}. Each segmentation is the root followed by the suffix
 * surface forms, separated by single spaces.
 *
 * @throws IOException if the output directory cannot be created or a file
 *     operation performed by the called helpers fails
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberekForMorfessor() throws IOException {
  Path wordFreqFile = DATA_PATH.resolve("vocab.all.freq");
  Path outDir = DATA_PATH.resolve("out");
  Files.createDirectories(outDir);
  TurkishMorphology parser = TurkishMorphology.createWithDefaults();
  Log.info("Loading histogram.");
  Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
  histogram.removeSmaller(1000);
  List<String> accepted = new ArrayList<>(histogram.size());
  // Counts only the words that pass the length filter below.
  int c = 0;
  for (String s : histogram) {
    s = s.trim();
    if (s.length() < 4) {
      continue;
    }
    List<WordAnalysis> parses = parser.analyze(s);
    if (!parses.isEmpty() && parses.get(0).dictionaryItem.primaryPos != PrimaryPos.Unknown) {
      // Insertion-ordered set removes duplicate segmentations while keeping analysis order.
      LinkedHashSet<String> segmentations = new LinkedHashSet<>(2);
      for (WordAnalysis parse : parses) {
        if (parse.dictionaryItem.lemma.length() > 1) {
          // Normalize whitespace on the WHOLE "root suffixes" string. Previously the
          // replaceAll/trim chain bound only to the joined suffixes, so an analysis with
          // empty suffix surfaces kept a trailing space ("root ") that leaked into the
          // comma-joined output as "root , ...".
          String segmentation =
              (parse.root + " " + String.join(" ", parse.suffixSurfaceList()))
                  .replaceAll("[ ]+", " ")
                  .trim();
          segmentations.add(segmentation);
        }
      }
      String join = String.join(", ", segmentations).trim();
      // Skip words whose only segmentation is the word itself, and degenerate results.
      if (!s.equals(join) && join.length() > 2) {
        accepted.add(s + " " + join);
      }
    }
    if (c > 0 && c % 10000 == 0) {
      Log.info("Processed = " + c);
    }
    c++;
  }
  sortAndSave(outDir.resolve("morfessor-annotation.txt"), accepted);
}
Use of zemberek.morphology.analysis.tr.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class ZemberekNlpScripts, method generatorTest.
/**
 * Script (ignored as a test) that analyzes the word "besiciliği" with the
 * default morphology and logs the inflectional groups of the first analysis.
 */
@Test
@Ignore("Not a Test.")
public void generatorTest() throws IOException {
  List<WordAnalysis> analyses = TurkishMorphology.createWithDefaults().analyze("besiciliği");
  // Only the first analysis is inspected.
  Log.info(analyses.get(0).inflectionalGroups);
}
Use of zemberek.morphology.analysis.tr.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class PerceptronNer, method main.
/**
 * Trains a perceptron NER model on the bracket-annotated Turkish training
 * corpus, runs it over the test corpus, and writes a test report.
 *
 * @param args unused
 * @throws IOException declared for the file-based calls made here
 */
public static void main(String[] args) throws IOException {
  // Alternative training corpus: experiment/src/main/resources/ner/reyyan.train.txt
  NerDataSet trainData = NerDataSet.loadBracketTurkishCorpus(
      Paths.get("experiment/src/main/resources/ner/NE-bracket.train.txt"));
  new NerDataSet.Info(trainData).log();

  // Alternative test corpus: experiment/src/main/resources/ner/reyyan.test.txt
  NerDataSet testData = NerDataSet.loadBracketTurkishCorpus(
      Paths.get("experiment/src/main/resources/ner/NE-bracket.test.txt"));
  new NerDataSet.Info(testData).log();

  Gazetteers gazetteers = new Gazetteers(
      Paths.get("experiment/src/main/resources/ner/location-words.txt"),
      Paths.get("experiment/src/main/resources/ner/organization-words.txt"));
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  // The test set is also passed to train(), exactly as the training call requires it.
  Map<String, ClassWeights> weights =
      PerceptronNer.train(morphology, gazetteers, trainData, testData, 10, 0.1f);
  PerceptronNer tagger = new PerceptronNer(weights, morphology, gazetteers);

  Log.info("Testing %d sentences.", testData.sentences.size());
  NerDataSet predicted = tagger.test(testData);
  testReport(
      testData, predicted, Paths.get("experiment/src/main/resources/ner/test-result.txt"));
  Log.info("Done.");
}
Aggregations