Search in sources:

Example 21 with Lexer

use of org.antlr.v4.runtime.Lexer in project zemberek-nlp by ahmetaa.

the class CategoryPredictionExperiment method generateSets.

/**
 * Builds a labelled, line-per-document text data set from a web corpus and passes it to
 * {@code saveSets}.
 *
 * <p>Each emitted line has the form {@code #<id> __label__<category> <text>}, where the text is
 * the space-joined tokens of the title or the content with numeric, punctuation, time and
 * unknown token types dropped, apostrophes removed and everything lower-cased with
 * {@code Turkish.LOCALE}. Duplicate lines are removed (first occurrence wins) before saving.
 *
 * @param input        location the corpus documents are loaded from
 * @param train        forwarded to {@code saveSets} as the train target
 * @param test         forwarded to {@code saveSets} as the test target
 * @param useOnlyTitle when true, only the document title is tokenized and documents with an
 *                     empty title are skipped; otherwise the document content is tokenized
 * @param useRoots     when true, each word is replaced by the last lemma of its best analysis
 * @throws IOException declared by the original signature; raised by the callees
 */
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useRoots) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishSentenceAnalyzer analyzer = new TurkishSentenceAnalyzer(morphology, new Z3MarkovModelDisambiguator());
    WebCorpus corpus = new WebCorpus("category", "category");
    Log.info("Loading corpus from %s", input);
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;

    // First pass: count how often every non-empty category occurs.
    Histogram<String> categoryCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (category.length() > 0) {
            categoryCounts.add(category);
        }
    }
    Log.info("All category count = %d", categoryCounts.size());
    // NOTE(review): presumably drops categories counted fewer than 20 times - confirm against Histogram.
    categoryCounts.removeSmaller(20);
    Log.info("Reduced label count = %d", categoryCounts.size());
    Log.info("Extracting data from %d documents ", corpus.documentCount());

    // Second pass: turn every document that survives the filters into one output line.
    int c = 0;
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (category.length() == 0) {
            continue;
        }
        String title = document.getTitle();
        if (useOnlyTitle && title.length() == 0) {
            continue;
        }
        // Run the category filter BEFORE tokenizing so that no tokenization work is spent on
        // documents that are going to be discarded anyway.
        if (!categoryCounts.contains(category)) {
            continue;
        }
        String label = "__label__" + category.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);

        // Only materialize the content string when it is actually the text being tokenized.
        String text = useOnlyTitle ? title : document.getContentAsString();
        List<Token> docTokens = lexer.tokenize(text);
        List<String> reduced = new ArrayList<>(docTokens.size());
        for (Token token : docTokens) {
            int type = token.getType();
            if (type == TurkishLexer.PercentNumeral || type == TurkishLexer.Number || type == TurkishLexer.Punctuation || type == TurkishLexer.RomanNumeral || type == TurkishLexer.Time || type == TurkishLexer.UnknownWord || type == TurkishLexer.Unknown) {
                continue;
            }
            reduced.add(token.getText());
        }
        String join = String.join(" ", reduced);
        if (useRoots) {
            SentenceAnalysis analysis = analyzer.analyze(join);
            analyzer.disambiguate(analysis);
            List<String> res = new ArrayList<>();
            for (SentenceAnalysis.Entry e : analysis) {
                // NOTE(review): assumes every entry carries at least one parse - confirm.
                WordAnalysis best = e.parses.get(0);
                if (best.isUnknown()) {
                    // Keep the surface form when the word could not be analyzed.
                    res.add(e.input);
                    continue;
                }
                List<String> lemmas = best.getLemmas();
                if (lemmas.size() == 0) {
                    continue;
                }
                res.add(lemmas.get(lemmas.size() - 1));
            }
            join = String.join(" ", res);
        }
        set.add("#" + document.getId() + " " + label + " " + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
        // c is incremented before it is logged, so the reported values are 1, 1001, 2001, ...
        if (c++ % 1000 == 0) {
            Log.info("%d of %d processed.", c, corpus.documentCount());
        }
    }
    Log.info("Generate train and test set.");
    // LinkedHashSet removes duplicate lines while keeping the order in which they were produced.
    saveSets(train, test, new LinkedHashSet<>(set));
}
Also used : Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) Token(org.antlr.v4.runtime.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) WebDocument(zemberek.corpus.WebDocument) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) WebCorpus(zemberek.corpus.WebCorpus)

Example 22 with Lexer

use of org.antlr.v4.runtime.Lexer in project zemberek-nlp by ahmetaa.

the class TurkishTokenizationExample method tokenIterator.

/**
 * Prints every token of a fixed sample sentence, together with its type, to standard output.
 * The tokens are pulled one at a time from the iterator returned by
 * {@code tokenizer.getTokenIterator}.
 */
public static void tokenIterator() {
    System.out.println("Low level tokenization iterator using Ant-lr Lexer.");
    final String sample = "İstanbul'a, merhaba!";
    System.out.println("Input = " + sample);
    // The iterator is scoped to the loop; there is no update clause because next() advances it.
    for (Iterator<Token> cursor = tokenizer.getTokenIterator(sample); cursor.hasNext(); ) {
        Token current = cursor.next();
        System.out.println("Token= " + current.getText() + " Type=" + current.getType());
    }
}
Also used : Token(org.antlr.v4.runtime.Token)

Example 23 with Lexer

use of org.antlr.v4.runtime.Lexer in project zemberek-nlp by ahmetaa.

the class TurkishTokenizer method getAllTokens.

/**
 * Drains the given lexer and collects its tokens into a list.
 *
 * <p>Reading stops at the first token whose type is {@code Token.EOF}; that token is not
 * included. Tokens whose type is rejected by {@code typeIgnored} are left out.
 *
 * @param lexer lexer to read tokens from
 * @return the accepted tokens, in the order the lexer produced them
 */
private List<Token> getAllTokens(Lexer lexer) {
    List<Token> accepted = new ArrayList<>();
    Token current = lexer.nextToken();
    while (current.getType() != Token.EOF) {
        if (!typeIgnored(current.getType())) {
            accepted.add(current);
        }
        current = lexer.nextToken();
    }
    return accepted;
}
Also used : ArrayList(java.util.ArrayList) Token(org.antlr.v4.runtime.Token)

Example 24 with Lexer

use of org.antlr.v4.runtime.Lexer in project JsoupXpath by zhegexiaohuozi.

the class JXDocument method selN.

/**
 * Evaluates an XPath expression against {@code elements} and returns the matches as nodes.
 *
 * <p>If the evaluation result reports {@code isElements()}, every element is wrapped with
 * {@code JXNode.e}; otherwise, if it reports {@code isList()}, every string is wrapped with
 * {@code JXNode.t}. Any other result yields an empty list.
 *
 * @param xpath the XPath expression to evaluate
 * @return the matching nodes; empty when the result is neither elements nor a list
 * @throws XpathSyntaxErrorException wrapping any exception raised while lexing, parsing or
 *         evaluating the expression; the message includes the root-cause message
 */
public List<JXNode> selN(String xpath) throws XpathSyntaxErrorException {
    List<JXNode> nodes = new LinkedList<>();
    try {
        XpathLexer xpathLexer = new XpathLexer(CharStreams.fromString(xpath));
        XpathParser xpathParser = new XpathParser(new CommonTokenStream(xpathLexer));
        // Installed before parsing so that parse errors are handled by DoFailOnErrorHandler.
        xpathParser.setErrorHandler(new DoFailOnErrorHandler());
        ParseTree parsed = xpathParser.main();
        XValue result = new XpathProcessor(elements).visit(parsed);
        if (result.isElements()) {
            for (Element element : result.asElements()) {
                nodes.add(JXNode.e(element));
            }
        } else if (result.isList()) {
            for (String text : result.asList()) {
                nodes.add(JXNode.t(text));
            }
        }
    } catch (Exception cause) {
        String prefix = "Please check the syntax of your xpath expr, ";
        throw new XpathSyntaxErrorException(prefix + ExceptionUtils.getRootCauseMessage(cause), cause);
    }
    return nodes;
}
Also used : CommonTokenStream(org.antlr.v4.runtime.CommonTokenStream) XpathLexer(cn.wanghaomiao.xpath.antlr.XpathLexer) Element(org.jsoup.nodes.Element) XValue(cn.wanghaomiao.xpath.core.XValue) LinkedList(java.util.LinkedList) CharStream(org.antlr.v4.runtime.CharStream) DoFailOnErrorHandler(cn.wanghaomiao.xpath.exception.DoFailOnErrorHandler) XpathSyntaxErrorException(cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException) XpathParser(cn.wanghaomiao.xpath.antlr.XpathParser) XpathSyntaxErrorException(cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException) XpathProcessor(cn.wanghaomiao.xpath.core.XpathProcessor) ParseTree(org.antlr.v4.runtime.tree.ParseTree)

Example 25 with Lexer

use of org.antlr.v4.runtime.Lexer in project java by wavefrontHQ.

the class AbstractIngesterFormatter method getQueue.

/**
 * Lexes the input string and returns its tokens, without EOF tokens, as a queue.
 *
 * @param input the text to tokenize
 * @return the non-EOF tokens in lexer order, backed by an {@code ArrayDeque}
 * @throws RuntimeException if the token stream holds no tokens at all after being filled
 */
protected Queue<Token> getQueue(String input) {
    // Reuse the lexer held by dsWrapperLexerThreadLocal and point it at the new input.
    DSWrapperLexer wrapperLexer = dsWrapperLexerThreadLocal.get();
    wrapperLexer.setInputStream(new ANTLRInputStream(input));
    CommonTokenStream stream = new CommonTokenStream(wrapperLexer);
    stream.fill();
    List<Token> lexed = stream.getTokens();
    // NOTE(review): fill() presumably always leaves at least the EOF token in the list, which
    // would make this guard unreachable - confirm before relying on it to reject bad input.
    if (lexed.isEmpty()) {
        throw new RuntimeException("Could not parse: " + input);
    }
    // This depends on the grammar in DSQuery.g4. A visitor would also work, but it requires
    // building the AST; working on the raw lexer output is enough here, and the graphite
    // format is not expected to change anytime soon.
    // Drop every EOF token and keep the rest in order.
    Queue<Token> withoutEof = new ArrayDeque<>();
    for (Token candidate : lexed) {
        if (candidate.getType() != Lexer.EOF) {
            withoutEof.add(candidate);
        }
    }
    return withoutEof;
}
Also used : ReportSourceTag(wavefront.report.ReportSourceTag) Token(org.antlr.v4.runtime.Token) CommonTokenStream(org.antlr.v4.runtime.CommonTokenStream) Supplier(java.util.function.Supplier) BaseErrorListener(org.antlr.v4.runtime.BaseErrorListener) ArrayList(java.util.ArrayList) Lexer(org.antlr.v4.runtime.Lexer) Lists(com.google.common.collect.Lists) DateUtils(org.apache.commons.lang.time.DateUtils) Matcher(java.util.regex.Matcher) Map(java.util.Map) ReportPoint(wavefront.report.ReportPoint) Nullable(javax.annotation.Nullable) ANTLRInputStream(org.antlr.v4.runtime.ANTLRInputStream) Recognizer(org.antlr.v4.runtime.Recognizer) Maps(com.google.common.collect.Maps) Collectors(java.util.stream.Collectors) DSWrapperLexer(queryserver.parser.DSWrapperLexer) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) RecognitionException(org.antlr.v4.runtime.RecognitionException) Preconditions(com.google.common.base.Preconditions) Queue(java.util.Queue) Pattern(java.util.regex.Pattern) ArrayDeque(java.util.ArrayDeque) Histogram(wavefront.report.Histogram) HistogramType(wavefront.report.HistogramType) CommonTokenStream(org.antlr.v4.runtime.CommonTokenStream) DSWrapperLexer(queryserver.parser.DSWrapperLexer) Token(org.antlr.v4.runtime.Token) ANTLRInputStream(org.antlr.v4.runtime.ANTLRInputStream)

Aggregations

Test (org.junit.Test): 427; LexerGrammar (org.antlr.v4.tool.LexerGrammar): 407; CommonTokenStream (org.antlr.v4.runtime.CommonTokenStream): 279; ANTLRInputStream (org.antlr.v4.runtime.ANTLRInputStream): 143; Grammar (org.antlr.v4.tool.Grammar): 125; LexerInterpreter (org.antlr.v4.runtime.LexerInterpreter): 108; CharStream (org.antlr.v4.runtime.CharStream): 103; ParseTree (org.antlr.v4.runtime.tree.ParseTree): 91; TokenStreamRewriter (org.antlr.v4.runtime.TokenStreamRewriter): 86; ATN (org.antlr.v4.runtime.atn.ATN): 56; IOException (java.io.IOException): 45; BaseJavaTest (org.antlr.v4.test.runtime.java.BaseJavaTest): 43; Token (org.antlr.v4.runtime.Token): 41; ParseTreeWalker (org.antlr.v4.runtime.tree.ParseTreeWalker): 39; ArrayList (java.util.ArrayList): 37; RecognitionException (org.antlr.v4.runtime.RecognitionException): 26; StringReader (java.io.StringReader): 23; ParserRuleContext (org.antlr.v4.runtime.ParserRuleContext): 23; TokenStream (org.antlr.v4.runtime.TokenStream): 23; Lexer (org.antlr.v4.runtime.Lexer): 22