Use of zemberek.normalization.TurkishSentenceNormalizer in the project zemberek-nlp by ahmetaa.
Class SentimentClassifier, method main.
/**
 * Entry point of the sentiment classification experiment.
 *
 * <p>Initializes the shared {@code morphology} and {@code normalizer} fields, calls
 * {@code generateData()}, loads the raw train and test lines, and then builds and evaluates
 * three variants of the data set in this order: "tokenized", "lemma" and "split".
 *
 * @param args command line arguments; not read by this method
 * @throws IOException propagated from the file loading / data generation calls
 */
public static void main(String[] args) throws IOException {
    SentimentClassifier classifier = new SentimentClassifier();

    // Shared analysis tools; both are assigned to fields declared outside this method.
    morphology = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).build();
    Path zemberekData = Paths.get("/home/aaa/zemberek-data");
    Path normalizationDir = zemberekData.resolve("normalization");
    Path languageModel = zemberekData.resolve("lm/lm.2gram.slm");
    normalizer = new TurkishSentenceNormalizer(morphology, normalizationDir, languageModel);

    classifier.generateData();

    // Raw data is read from the trainRaw / testRaw paths declared outside this method.
    List<String> rawTrain = TextIO.loadLines(trainRaw);
    List<String> rawTest = TextIO.loadLines(testRaw);

    Log.info("Train data:");
    classifier.dataInfo(rawTrain);
    Log.info("Test data:");
    classifier.dataInfo(rawTest);

    // Variant 1: tokenized text.
    Path trainTokenized = t1out.resolve("train-tokenized");
    Path testTokenized = t1out.resolve("test-tokenized");
    classifier.generateSetTokenized(rawTrain, trainTokenized);
    classifier.generateSetTokenized(rawTest, testTokenized);
    classifier.evaluate(t1out, trainTokenized, testTokenized, "tokenized");

    // Variant 2: lemmas.
    Path trainLemma = t1out.resolve("train-lemma");
    Path testLemma = t1out.resolve("test-lemma");
    classifier.generateSetWithLemmas(rawTrain, trainLemma);
    classifier.generateSetWithLemmas(rawTest, testLemma);
    classifier.evaluate(t1out, trainLemma, testLemma, "lemma");

    // Variant 3: split words.
    Path trainSplit = t1out.resolve("train-split");
    Path testSplit = t1out.resolve("test-split");
    classifier.generateSetWithSplit(rawTrain, trainSplit);
    classifier.generateSetWithSplit(rawTest, testSplit);
    classifier.evaluate(t1out, trainSplit, testSplit, "split");
}
Use of zemberek.normalization.TurkishSentenceNormalizer in the project zemberek-nlp by ahmetaa.
Class ClassificationExperiment, method main.
/**
 * Entry point of the classification experiment.
 *
 * <p>Initializes the shared {@code morphology} and {@code normalizer} fields (with the
 * deasciifier always applied), calls {@code generateData(200)}, loads the raw train and test
 * lines, counts their tokens, and then builds and evaluates the "split" variant of the data
 * set. The "tokenized" and "lemma" variants are present only as commented-out code.
 *
 * @param args command line arguments; not read by this method
 * @throws IOException propagated from the file loading / data generation calls
 */
public static void main(String[] args) throws IOException {
    ClassificationExperiment runner = new ClassificationExperiment();

    // Shared analysis tools; both are assigned to fields declared outside this method.
    morphology = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).build();
    Path zemberekData = Paths.get("/home/aaa/zemberek-data");
    Path normalizationDir = zemberekData.resolve("normalization");
    Path languageModel = zemberekData.resolve("lm/lm.2gram.slm");
    normalizer = new TurkishSentenceNormalizer(morphology, normalizationDir, languageModel);
    normalizer.setAlwaysApplyDeasciifier(true);

    runner.generateData(200);

    // Raw data is read from the trainRaw / testRaw paths declared outside this method.
    List<String> rawTrain = TextIO.loadLines(trainRaw);
    List<String> rawTest = TextIO.loadLines(testRaw);
    countTokens(trainRaw, testRaw);

    Log.info("Train data:");
    runner.dataInfo(rawTrain);
    Log.info("Test data:");
    runner.dataInfo(rawTest);

    /*
    Disabled variants ("tokenized" and "lemma"); uncomment to run them as well.

    Path trainTokenized = t1out.resolve("train-tokenized");
    Path testTokenized = t1out.resolve("test-tokenized");
    runner.generateSetTokenized(rawTrain, trainTokenized);
    runner.generateSetTokenized(rawTest, testTokenized);
    countTokens(trainTokenized, testTokenized);
    runner.evaluate(t1out, trainTokenized, testTokenized, "tokenized");

    Path trainLemma = t1out.resolve("train-lemma");
    Path testLemma = t1out.resolve("test-lemma");
    runner.generateSetWithLemmas(rawTrain, trainLemma);
    runner.generateSetWithLemmas(rawTest, testLemma);
    countTokens(trainLemma, testLemma);
    runner.evaluate(t1out, trainLemma, testLemma, "lemma");
    */

    // Active variant: split words.
    Path trainSplit = t1out.resolve("train-split");
    Path testSplit = t1out.resolve("test-split");
    runner.generateSetWithSplit(rawTrain, trainSplit);
    runner.generateSetWithSplit(rawTest, testSplit);
    countTokens(trainSplit, testSplit);
    runner.evaluate(t1out, trainSplit, testSplit, "split");
}
Use of zemberek.normalization.TurkishSentenceNormalizer in the project zemberek-nlp by ahmetaa.
Class NormalizeNoisyText, method main.
/**
 * Normalizes a fixed set of noisy Turkish example sentences and prints, for each one, the
 * original sentence, its normalized form and an empty line.
 *
 * <p>The normalizer needs a data root folder that contains a {@code normalization} folder and
 * the language model file {@code lm/lm.2gram.slm}.
 * Example data: https://drive.google.com/drive/folders/1tztjRiUs9BOTH-tb1v7FWyixl-iUpydW
 * (download the lm and normalization folders to some local directory).
 *
 * @param args optional; when {@code args[0]} is present and non-empty it is used as the data
 *     root folder, otherwise the default {@code /home/aaa/zemberek-data} is used
 * @throws IOException propagated from the normalizer construction
 */
public static void main(String[] args) throws IOException {
    String[] examples = { "Yrn okua gidicem", "Tmm, yarin havuza giricem ve aksama kadar yaticam :)", "ah aynen ya annemde fark ettı siz evinizden cıkmayın diyo", "gercek mı bu? Yuh! Artık unutulması bile beklenmiyo", "Hayır hayat telaşm olmasa alacam buraları gökdelen dikicem.", "yok hocam kesınlıkle oyle birşey yok", "herseyi soyle hayatında olmaması gerek bence boyle ınsanların falan baskı yapıyosa" };

    // The data root can be supplied as the first command line argument so that the source
    // does not have to be edited per machine; without it the original default is kept.
    boolean hasRootArgument = args != null && args.length > 0 && !args[0].isEmpty();
    Path zemberekDataRoot = Paths.get(hasRootArgument ? args[0] : "/home/aaa/zemberek-data");
    Path lookupRoot = zemberekDataRoot.resolve("normalization");
    Path lmPath = zemberekDataRoot.resolve("lm/lm.2gram.slm");

    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishSentenceNormalizer normalizer = new TurkishSentenceNormalizer(morphology, lookupRoot, lmPath);

    for (String example : examples) {
        System.out.println(example);
        System.out.println(normalizer.normalize(example));
        System.out.println();
    }
}
Aggregations