Use of org.antlr.v4.runtime.Lexer in project zemberek-nlp by ahmetaa.
From the class CategoryPredictionExperiment, method generateSets:
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useRoots)
    throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSentenceAnalyzer analyzer =
      new TurkishSentenceAnalyzer(morphology, new Z3MarkovModelDisambiguator());
  WebCorpus corpus = new WebCorpus("category", "category");
  Log.info("Loading corpus from %s", input);
  corpus.addDocuments(WebCorpus.loadDocuments(input));
  List<String> set = new ArrayList<>(corpus.documentCount());
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;

  // Count categories and keep only those that occur at least 20 times.
  Histogram<String> categoryCounts = new Histogram<>();
  for (WebDocument document : corpus.getDocuments()) {
    String category = document.getCategory();
    if (category.length() > 0) {
      categoryCounts.add(category);
    }
  }
  Log.info("All category count = %d", categoryCounts.size());
  categoryCounts.removeSmaller(20);
  Log.info("Reduced label count = %d", categoryCounts.size());

  Log.info("Extracting data from %d documents ", corpus.documentCount());
  int c = 0;
  for (WebDocument document : corpus.getDocuments()) {
    if (document.getCategory().length() == 0) {
      continue;
    }
    if (useOnlyTitle && document.getTitle().length() == 0) {
      continue;
    }
    String content = document.getContentAsString();
    String title = document.getTitle();
    List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
    List<String> reduced = new ArrayList<>(docTokens.size());

    String category = document.getCategory();
    if (categoryCounts.contains(category)) {
      // Normalize the category into a fastText style "__label__" prefix.
      category = "__label__" +
          document.getCategory().replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
    } else {
      continue;
    }

    // Drop numbers, punctuation, time expressions and unknown tokens.
    for (Token token : docTokens) {
      if (token.getType() == TurkishLexer.PercentNumeral ||
          token.getType() == TurkishLexer.Number ||
          token.getType() == TurkishLexer.Punctuation ||
          token.getType() == TurkishLexer.RomanNumeral ||
          token.getType() == TurkishLexer.Time ||
          token.getType() == TurkishLexer.UnknownWord ||
          token.getType() == TurkishLexer.Unknown) {
        continue;
      }
      String tokenStr = token.getText();
      reduced.add(tokenStr);
    }
    String join = String.join(" ", reduced);

    if (useRoots) {
      // Replace each token with the last lemma of its best morphological analysis.
      SentenceAnalysis analysis = analyzer.analyze(join);
      analyzer.disambiguate(analysis);
      List<String> res = new ArrayList<>();
      for (SentenceAnalysis.Entry e : analysis) {
        WordAnalysis best = e.parses.get(0);
        if (best.isUnknown()) {
          res.add(e.input);
          continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.size() == 0) {
          continue;
        }
        res.add(lemmas.get(lemmas.size() - 1));
      }
      join = String.join(" ", res);
    }

    set.add("#" + document.getId() + " " + category + " " +
        join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
    if (c++ % 1000 == 0) {
      Log.info("%d of %d processed.", c, corpus.documentCount());
    }
  }
  Log.info("Generate train and test set.");
  saveSets(train, test, new LinkedHashSet<>(set));
}
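A minimal usage sketch for the method above. The file paths and flag values are hypothetical; since generateSets is private, a call like this would live inside CategoryPredictionExperiment itself.

import java.nio.file.Path;
import java.nio.file.Paths;

// Inside CategoryPredictionExperiment (hypothetical paths):
Path corpusFile = Paths.get("news-corpus.doc");   // input corpus of WebDocuments
Path trainFile = Paths.get("category.train");     // output training set
Path testFile = Paths.get("category.test");       // output test set
// Use only document titles and reduce tokens to their last lemma.
generateSets(corpusFile, trainFile, testFile, true, true);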
Use of org.antlr.v4.runtime.Lexer in project zemberek-nlp by ahmetaa.
From the class TurkishTokenizationExample, method tokenIterator:
public static void tokenIterator() {
  System.out.println("Low level tokenization iterator using ANTLR Lexer.");
  String input = "İstanbul'a, merhaba!";
  System.out.println("Input = " + input);
  // `tokenizer` is a static TurkishTokenizer field defined elsewhere in the class.
  Iterator<Token> tokenIterator = tokenizer.getTokenIterator(input);
  while (tokenIterator.hasNext()) {
    Token token = tokenIterator.next();
    System.out.println("Token= " + token.getText() + " Type=" + token.getType());
  }
}
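A small variation on the example above, assuming the same static tokenizer field: collect the token texts into a list instead of printing them.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.antlr.v4.runtime.Token;

// Collects token texts from the iterator; assumes the `tokenizer` field above.
static List<String> tokenTexts(String input) {
  List<String> texts = new ArrayList<>();
  Iterator<Token> it = tokenizer.getTokenIterator(input);
  while (it.hasNext()) {
    texts.add(it.next().getText());
  }
  return texts;
}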
Use of org.antlr.v4.runtime.Lexer in project zemberek-nlp by ahmetaa.
From the class TurkishTokenizer, method getAllTokens:
private List<Token> getAllTokens(Lexer lexer) {
  List<Token> tokens = new ArrayList<>();
  // Pull tokens from the lexer until EOF, skipping ignored token types.
  for (Token token = lexer.nextToken(); token.getType() != Token.EOF; token = lexer.nextToken()) {
    int type = token.getType();
    if (typeIgnored(type)) {
      continue;
    }
    tokens.add(token);
  }
  return tokens;
}
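This private helper backs the public tokenization API used in the first example. A minimal caller sketch, using only calls that appear elsewhere on this page (zemberek and ANTLR imports assumed):

// Tokenize a short Turkish string with the default tokenizer.
TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
List<Token> tokens = tokenizer.tokenize("İstanbul'a, merhaba!");
for (Token t : tokens) {
  System.out.println(t.getText() + " type=" + t.getType());
}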
Use of org.antlr.v4.runtime.Lexer in project JsoupXpath by zhegexiaohuozi.
From the class JXDocument, method selN:
public List<JXNode> selN(String xpath) throws XpathSyntaxErrorException {
  List<JXNode> finalRes = new LinkedList<>();
  try {
    CharStream input = CharStreams.fromString(xpath);
    XpathLexer lexer = new XpathLexer(input);
    CommonTokenStream tokens = new CommonTokenStream(lexer);
    XpathParser parser = new XpathParser(tokens);
    parser.setErrorHandler(new DoFailOnErrorHandler());
    ParseTree tree = parser.main();
    XpathProcessor processor = new XpathProcessor(elements);
    XValue calRes = processor.visit(tree);
    if (calRes.isElements()) {
      for (Element el : calRes.asElements()) {
        finalRes.add(JXNode.e(el));
      }
    } else if (calRes.isList()) {
      for (String str : calRes.asList()) {
        finalRes.add(JXNode.t(str));
      }
    }
  } catch (Exception e) {
    String msg = "Please check the syntax of your xpath expr, ";
    throw new XpathSyntaxErrorException(msg + ExceptionUtils.getRootCauseMessage(e), e);
  }
  return finalRes;
}
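A usage sketch for selN, assuming `doc` is an already constructed JXDocument (construction is not shown above) and a hypothetical XPath expression:

// Select all href attributes and print the resulting nodes.
List<JXNode> links = doc.selN("//a/@href");   // hypothetical expression
for (JXNode node : links) {
  System.out.println(node);
}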
Use of org.antlr.v4.runtime.Lexer in project java by wavefrontHQ.
From the class AbstractIngesterFormatter, method getQueue:
protected Queue<Token> getQueue(String input) {
  DSWrapperLexer lexer = dsWrapperLexerThreadLocal.get();
  lexer.setInputStream(new ANTLRInputStream(input));
  CommonTokenStream commonTokenStream = new CommonTokenStream(lexer);
  commonTokenStream.fill();
  List<Token> tokens = commonTokenStream.getTokens();
  if (tokens.isEmpty()) {
    throw new RuntimeException("Could not parse: " + input);
  }
  // This is sensitive to the grammar in DSQuery.g4. We could use the visitor instead,
  // but that would require building the AST; using only the lexer is enough here.
  // In any case, we do not expect the graphite format to change anytime soon.
  // Filter out all EOF tokens first.
  Queue<Token> queue = tokens.stream()
      .filter(t -> t.getType() != Lexer.EOF)
      .collect(Collectors.toCollection(ArrayDeque::new));
  return queue;
}
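A sketch of consuming the returned queue. The input line is a hypothetical placeholder, and since getQueue is protected, code like this would run inside a subclass of AbstractIngesterFormatter:

// Drain the token queue and inspect each token.
Queue<Token> queue = getQueue("some.metric 42 source=host1");  // hypothetical input line
while (!queue.isEmpty()) {
  Token token = queue.poll();
  System.out.println(token.getText() + " type=" + token.getType());
}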