use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class SpeedTest method testNewsCorpus.
/**
 * Speed test: analyzes every non-punctuation token of the sample news corpus and reports
 * throughput and parse ratio.
 *
 * <p>Tokens whose analysis is not reported as correct are counted and collected in a histogram.
 * After the run, the elapsed time, token count, parse ratio and tokens/sec are logged, and the
 * failed tokens are saved to the relative paths {@code unk.freq} (sorted by count) and
 * {@code unk} (sorted by key using the Turkish ascending comparator).
 *
 * @throws IOException declared by the method; presumably raised when loading the corpus or
 *     saving the histogram files fails
 */
@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpus() throws IOException {
  // Alternative (machine-local) corpus location, kept for reference:
  // Paths.get("/media/aaa/Data/corpora/me-sentences/www.aljazeera.com.tr/2018-02-22")
  Path corpusPath = Paths.get("src/test/resources/corpora/cnn-turk-10k");
  List<String> corpus = getSentences(corpusPath);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  // Timing starts only after the corpus and the analyzer have been loaded.
  Stopwatch stopwatch = Stopwatch.createStarted();
  int analyzedTokens = 0;
  int unparsedTokens = 0;
  int processedSentences = 0;
  Histogram<String> unknownWords = new Histogram<>(100000);

  for (String line : corpus) {
    for (Token token : TurkishTokenizer.DEFAULT.tokenize(line)) {
      // Punctuation is excluded from both the token count and the analysis.
      if (token.getType() != Token.Type.Punctuation) {
        analyzedTokens++;
        WordAnalysis analysis = morphology.analyze(token.getText());
        if (!analysis.isCorrect()) {
          unparsedTokens++;
          unknownWords.add(token.getText());
        }
      }
    }
    processedSentences++;
    // Progress report every 2000 sentences.
    if (processedSentences % 2000 == 0) {
      Log.info("%d tokens analyzed.", analyzedTokens);
    }
  }

  long elapsedMillis = stopwatch.stop().elapsed(TimeUnit.MILLISECONDS);
  double seconds = elapsedMillis / 1000d;
  double speed = analyzedTokens / seconds;
  double failedPercent = unparsedTokens * 100d / analyzedTokens;
  double parseRatio = 100 - failedPercent;

  Log.info("%nElapsed = %.2f seconds", seconds);
  Log.info(
      "%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n",
      analyzedTokens,
      parseRatio,
      speed);

  Log.info("Saving Unknown Tokens");
  unknownWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
  unknownWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class SpeedTest method main.
/**
 * Runs {@code testForVisualVm} ten times over the sample corpus with a single shared analyzer.
 *
 * <p>After each run the analyzer cache is invalidated and the program blocks on
 * {@code System.in.read()} until input is available on standard input, so the operator decides
 * when the next run starts (presumably to inspect the process in a profiler between runs).
 *
 * @param args command line arguments; not used
 * @throws IOException if reading from standard input fails, or if propagated from the test run
 */
public static void main(String[] args) throws IOException {
  Path corpus = Paths.get("morphology/src/test/resources/corpora/cnn-turk-10k");
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  int remainingRuns = 10;
  while (remainingRuns > 0) {
    testForVisualVm(corpus, morphology);
    // Drop cached analyses so the next run does not benefit from this one.
    morphology.invalidateCache();
    // Wait for input on stdin before starting the next run.
    System.in.read();
    remainingRuns--;
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class WordAnalysisSurfaceFormatterTest method formatVerbs.
/**
 * Checks formatting of verb forms with a lexicon that contains only "olmak": both the lower-case
 * and the capitalized input are expected back unchanged, with and without an apostrophe argument.
 */
@Test
public void formatVerbs() {
  TurkishMorphology verbOnlyMorphology =
      TurkishMorphology.builder()
          .disableCache()
          .setLexicon("olmak")
          .build();

  String[] words = {"olarak", "Olarak"};
  String[] formatted = {"olarak", "Olarak"};

  // First without an apostrophe, then with one: giving an apostrophe should not affect the output.
  for (String apostrophe : new String[] {null, "'"}) {
    check(verbOnlyMorphology, words, formatted, apostrophe);
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class WordAnalysisSurfaceFormatterTest method formatKnownProperNouns.
/**
 * Checks formatting of proper nouns that are present in the lexicon: the expected outputs carry
 * the lexicon's capitalization, and suffixed forms have the suffix separated by an apostrophe.
 */
@Test
public void formatKnownProperNouns() {
  TurkishMorphology properNounMorphology =
      TurkishMorphology.builder()
          .disableCache()
          .setLexicon(
              "Ankara",
              "Iphone [Pr:ayfon, A:LocaleEn]",
              "Google [Pr:gugıl]")
          .build();

  // Raw surface forms as they would be typed, mostly without capitalization or apostrophes.
  String[] rawForms = {
    "ankarada",
    "ıphonumun",
    "googledan",
    "Iphone",
    "Google",
    "Googlesa"
  };
  // Expected formatted forms, index-aligned with rawForms.
  String[] formattedForms = {
    "Ankara'da",
    "Iphone'umun",
    "Google'dan",
    "Iphone",
    "Google",
    "Google'sa"
  };

  check(properNounMorphology, rawForms, formattedForms, "'");
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class PerceptronAmbiguityResolverEvaluation method main.
/**
 * Trains a perceptron ambiguity resolver, saves it, and evaluates the saved models.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Build a morphology instance from the listed dictionary resources.</li>
 *   <li>Load all training files into one data set and load the development set.</li>
 *   <li>Train the resolver, prune near-zero weights and save the model as text.</li>
 *   <li>Reload the text model, write a compressed copy of it, and test the reloaded model.</li>
 *   <li>Load the compressed model and test it on the same test file.</li>
 * </ol>
 *
 * <p>All data locations are hard-coded; most of them live under a machine-specific root
 * directory.
 *
 * @param args command line arguments; not used
 * @throws IOException declared by the method; presumably raised when reading data sets or
 *     reading/writing model files fails
 */
public static void main(String[] args) throws IOException {
  // Data locations.
  Path dataRoot = Paths.get("/media/ahmetaa/depo/ambiguity");
  List<Path> trainingFiles = Lists.newArrayList(
      Paths.get("data/gold/gold1.txt"),
      dataRoot.resolve("www.aljazeera.com.tr-rule-result.txt"),
      dataRoot.resolve("wowturkey.com-rule-result.txt"),
      dataRoot.resolve("open-subtitles-tr-2018-rule-result.txt"),
      dataRoot.resolve("sak.train"),
      dataRoot.resolve("www.haberturk.com-rule-result.txt"),
      dataRoot.resolve("www.cnnturk.com-rule-result.txt"));
  Path devFile = dataRoot.resolve("sak.dev");
  Path testFile = dataRoot.resolve("sak.test");

  // Model output locations.
  Path textModelPath = Paths.get("morphology/src/main/resources/tr/ambiguity/model");
  Path compressedModelPath =
      Paths.get("morphology/src/main/resources/tr/ambiguity/model-compressed");

  RootLexicon lexicon = RootLexicon.builder()
      .addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict",
          "tr/person-names.dict")
      .build();
  TurkishMorphology morphology = TurkishMorphology.create(lexicon);

  // Merge every training file into a single data set.
  DataSet trainingData = new DataSet();
  for (Path trainingFile : trainingFiles) {
    DataSet loaded = DataSet.load(trainingFile, morphology);
    trainingData.add(loaded);
  }
  DataSet devData = DataSet.load(devFile, morphology);

  // NOTE(review): the third argument (7) is presumably the iteration count; confirm against
  // PerceptronAmbiguityResolverTrainer.train.
  PerceptronAmbiguityResolverTrainer trainer = new PerceptronAmbiguityResolverTrainer(morphology);
  PerceptronAmbiguityResolver trained = trainer.train(trainingData, devData, 7);

  Weights trainedWeights = (Weights) trained.getModel();
  trainedWeights.pruneNearZeroWeights();
  trainedWeights.saveAsText(textModelPath);

  System.out.println("Load model and test");
  PerceptronAmbiguityResolver fromText = PerceptronAmbiguityResolver.fromModelFile(textModelPath);
  // The compressed model is produced from the reloaded text model, not from the in-memory one.
  Weights reloadedWeights = (Weights) fromText.getModel();
  reloadedWeights.compress().serialize(compressedModelPath);
  PerceptronAmbiguityResolverTrainer.test(testFile, morphology, fromText);

  System.out.println("Load compressed model and test");
  PerceptronAmbiguityResolver fromCompressed =
      PerceptronAmbiguityResolver.fromModelFile(compressedModelPath);
  PerceptronAmbiguityResolverTrainer.test(testFile, morphology, fromCompressed);
}
Aggregations