Use of zemberek.tokenization.TurkishSentenceExtractor in project zemberek-nlp by ahmetaa.
The class SentenceBoundaryDetection, method simpleSentenceBoundaryDetector:
public static void simpleSentenceBoundaryDetector() {
  String input =
      "Prof. Dr. Veli Davul açıklama yaptı. Kimse %6.5 lik enflasyon oranını beğenmemiş!"
          + " Oysa maçta ikinci olmuştuk... Değil mi?";
  System.out.println("Paragraph = " + input);
  TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
  List<String> sentences = extractor.fromParagraph(input);
  System.out.println("Sentences:");
  for (String sentence : sentences) {
    System.out.println(sentence);
  }
}
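
The extractor used above is expected not to split after common Turkish abbreviations such as "Prof." and "Dr.", which is exactly what this example demonstrates. A self-contained sketch of the same call, runnable as a plain main method (class name and output formatting here are illustrative):

import java.util.List;
import zemberek.tokenization.TurkishSentenceExtractor;

public class SentenceSplitSketch {

  public static void main(String[] args) {
    TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
    // "Prof." and "Dr." are treated as abbreviations, not sentence ends.
    List<String> sentences =
        extractor.fromParagraph("Prof. Dr. Veli Davul açıklama yaptı. Değil mi?");
    for (String sentence : sentences) {
      System.out.println(sentence);
    }
  }
}
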

Use of zemberek.tokenization.TurkishSentenceExtractor in project zemberek-nlp by ahmetaa.
The class SpellingCorpusProducer, method extractSentences:
void extractSentences(Path input, Path output) throws IOException {
  TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
  WebCorpus wc = getWebCorpus(input);
  Log.info("Processing documents.");
  for (WebDocument doc : wc.getDocuments()) {
    List<String> paragraphs = doc.getLines();
    List<String> sentences = new ArrayList<>(paragraphs.size() * 5);
    for (String paragraph : paragraphs) {
      sentences.addAll(extractor.fromParagraph(paragraph));
    }
    // set new content.
    doc.setContent(sentences);
  }
  Log.info("Saving corpus to %s", output);
  wc.save(output, false);
}
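
WebCorpus, WebDocument, and the getWebCorpus helper come from the project's corpus utilities and are not shown here; the core step is simply flattening each document's paragraphs into one sentence list. A reduced sketch of just that step over plain strings (the class and method names below are illustrative, not part of the zemberek API):

import java.util.ArrayList;
import java.util.List;
import zemberek.tokenization.TurkishSentenceExtractor;

class ParagraphFlattener {

  // Splits every paragraph and collects the resulting sentences in order.
  static List<String> flatten(List<String> paragraphs) {
    TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
    List<String> sentences = new ArrayList<>(paragraphs.size() * 5);
    for (String paragraph : paragraphs) {
      sentences.addAll(extractor.fromParagraph(paragraph));
    }
    return sentences;
  }
}

The next example performs the same flattening with a single call to fromParagraphs.
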

Use of zemberek.tokenization.TurkishSentenceExtractor in project zemberek-nlp by ahmetaa.
The class SpellCheckerPerformanceTests, method correctWordFindingTest:
@Test
@Ignore(value = "Not a test.")
public void correctWordFindingTest() throws Exception {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);
  TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
  TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
  Path path = new File(Resources.getResource("spell-checker-test.txt").getFile()).toPath();
  List<String> lines = Files.readAllLines(path);
  List<String> sentences = extractor.fromParagraphs(lines);
  Stopwatch sw = Stopwatch.createStarted();
  Histogram<String> incorrectFound = new Histogram<>();
  Histogram<String> correctFound = new Histogram<>();
  for (String sentence : sentences) {
    List<Token> tokens = tokenizer.tokenize(sentence);
    for (Token token : tokens) {
      String text = token.getText();
      if (!spellChecker.check(text)) {
        incorrectFound.add(text);
      } else {
        correctFound.add(text);
      }
    }
  }
  Log.info("Elapsed = %d", sw.elapsed(TimeUnit.MILLISECONDS));
  Log.info("Incorrect (total/unique) = %d / %d", incorrectFound.totalCount(), incorrectFound.size());
  Log.info("Correct (total/unique) = %d / %d", correctFound.totalCount(), correctFound.size());
  incorrectFound.saveSortedByCounts(Paths.get("incorrect.txt"), " : ");
  correctFound.saveSortedByCounts(Paths.get("correct.txt"), " : ");
  /*
  Path lmPath = Paths.get(ClassLoader.getSystemResource("lm-bigram.slm").toURI());
  SmoothLm model = SmoothLm.builder(lmPath.toFile()).build();
  */
}
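
The test above only counts tokens accepted or rejected by check. If correction candidates are also needed, TurkishSpellChecker can rank suggestions; a minimal sketch, assuming the suggestForWord method and the import paths of recent zemberek releases, with a deliberately misspelled sample word:

import java.util.List;
import zemberek.morphology.TurkishMorphology;
import zemberek.normalization.TurkishSpellChecker;

class SuggestionSketch {

  public static void main(String[] args) throws Exception {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);
    String word = "kitapda"; // deliberately misspelled sample word
    if (!spellChecker.check(word)) {
      // suggestForWord is assumed here; it returns ranked correction candidates.
      List<String> suggestions = spellChecker.suggestForWord(word);
      System.out.println(word + " -> " + suggestions);
    }
  }
}
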

Use of zemberek.tokenization.TurkishSentenceExtractor in project zemberek-nlp by ahmetaa.
The class SentenceBoundaryDetection, method simpleSentenceBoundaryDetector (variant that prints through Log.info):
public static void simpleSentenceBoundaryDetector() {
  String input =
      "Prof. Dr. Veli Davul açıklama yaptı. Kimse %6.5 lik enflasyon oranını beğenmemiş!"
          + " Oysa maçta ikinci olmuştuk... Değil mi?";
  Log.info("Paragraph = " + input);
  TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
  List<String> sentences = extractor.fromParagraph(input);
  Log.info("Sentences:");
  for (String sentence : sentences) {
    Log.info(sentence);
  }
}

Use of zemberek.tokenization.TurkishSentenceExtractor in project zemberek-nlp by ahmetaa.
The class PreprocessingServiceImpl, method extractSentences:
public void extractSentences(
    SentenceExtractionRequest request,
    StreamObserver<SentenceExtractionResponse> responseObserver) {
  TurkishSentenceExtractor extractor =
      request.getDoNotSplitInDoubleQuotes() ? defaultExtractor : doubleQuoteIgnoreExtractor;
  responseObserver.onNext(
      SentenceExtractionResponse.newBuilder()
          .addAllSentences(extractor.fromDocument(request.getDocument()))
          .build());
  responseObserver.onCompleted();
}
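
The defaultExtractor and doubleQuoteIgnoreExtractor fields are initialized elsewhere in the service class. A hedged sketch of how an extractor that keeps quoted spans intact could be built; the builder option name mirrors the request flag and should be verified against the zemberek version in use:

import java.io.IOException;
import zemberek.tokenization.TurkishSentenceExtractor;

class ExtractorSetupSketch {

  // Standard extractor with the default boundary rules.
  static final TurkishSentenceExtractor DEFAULT_EXTRACTOR = TurkishSentenceExtractor.DEFAULT;

  // Assumed builder option: do not place sentence boundaries inside double quotes.
  static TurkishSentenceExtractor quoteAwareExtractor() throws IOException {
    return TurkishSentenceExtractor.builder().doNotSplitInDoubleQuotes().build();
  }
}
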