Search in sources :

Example 1 with CoOccurrenceItem

use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.

the class TextRank method computeCooccurrent.

/**
 * Builds the co-occurrence adjacency map (source tag -> destination tag -> item) from the
 * preliminary list of co-occurring tag pairs.
 *
 * When {@code expandNEs} is set and the pairs do not come from dependencies, a tag that has an
 * entry in {@code neExp} is wired up through {@code connectTagsInNE} and then replaced: a source
 * tag by the {@code second()} of the last pair of its expansion (its start position is shifted by
 * that pair's {@code first()}), a destination tag by the {@code second()} of the first pair.
 *
 * @param fromDependencies whether {@code prelim} was produced from dependencies; expansion is skipped if so
 * @param prelim           preliminary co-occurrence items, processed in list order
 * @param neExp            expansion pairs keyed by tag id
 * @return the adjacency map filled through {@code addTagToCoOccurrence}
 */
private Map<Long, Map<Long, CoOccurrenceItem>> computeCooccurrent(boolean fromDependencies, List<CoOccurrenceItem> prelim, Map<Long, List<Pair<Long, Long>>> neExp) {
    Map<Long, Map<Long, CoOccurrenceItem>> graph = new HashMap<>();
    // id of the expanded tag that was the destination of the previous item (0 = none)
    long previousDestinationNE = 0L;
    for (CoOccurrenceItem item : prelim) {
        Long source = item.getSource();
        Long destination = item.getDestination();
        int sourceStart = item.getStartPositions().get(0).first().intValue();
        int destinationStart = item.getStartPositions().get(0).second().intValue();
        if (expandNEs && !fromDependencies) {
            if (neExp.containsKey(source)) {
                List<Pair<Long, Long>> sourceParts = neExp.get(source);
                if (previousDestinationNE == 0L || previousDestinationNE != source.longValue()) {
                    // not already connected as the destination of the previous item
                    connectTagsInNE(graph, sourceParts, sourceStart);
                    previousDestinationNE = 0L;
                }
                Pair<Long, Long> lastPart = sourceParts.get(sourceParts.size() - 1);
                sourceStart += lastPart.first().intValue();
                source = lastPart.second();
            }
            if (neExp.containsKey(destination)) {
                List<Pair<Long, Long>> destinationParts = neExp.get(destination);
                connectTagsInNE(graph, destinationParts, destinationStart);
                previousDestinationNE = destination;
                destination = destinationParts.get(0).second();
            } else {
                previousDestinationNE = 0L;
            }
        }
        addTagToCoOccurrence(graph, source, sourceStart, destination, destinationStart);
        if (!directionsMatter) {
            // direction of the relationship is irrelevant, so register the reverse edge as well
            addTagToCoOccurrence(graph, destination, destinationStart, source, sourceStart);
        }
    }
    return graph;
}
Also used : CoOccurrenceItem(com.graphaware.nlp.ml.pagerank.CoOccurrenceItem)

Example 2 with CoOccurrenceItem

use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.

the class TextRank method processOneAnnotatedText.

/**
 * Executes the co-occurrence query for one annotated text and appends every admissible tag
 * pair to {@code prelim}.
 *
 * Positions of the new pairs are shifted by an offset derived from the last item already in
 * {@code prelim}, so that consecutive annotated texts do not overlap. The query result is fully
 * read and closed while the transaction is still open; if the query or the fetch fails, the
 * error is logged and nothing is added.
 *
 * @param query  Cypher query returning {@code tag1}, {@code tag2}, {@code tag1_id}, {@code tag2_id},
 *               the four position columns and optionally {@code pos1}/{@code pos2}
 * @param params query parameters
 * @param prelim list the resulting co-occurrence items are appended to
 */
private void processOneAnnotatedText(String query, Map<String, Object> params, List<CoOccurrenceItem> prelim) {
    int offsetStart = 0;
    if (!prelim.isEmpty()) {
        // find the last word position from previous annotated text and add some number so we can merge previous text with the coming one
        offsetStart = prelim.get(prelim.size() - 1).getEndPositions().get(0).second().intValue() + 2;
        LOG.debug("\n Processing another AnnotatedText. Offset: " + offsetStart);
    }
    // Read all rows while the transaction is open: the result must not be used after it is closed.
    List<Map<String, Object>> rows = new ArrayList<>();
    try (Transaction tx = database.beginTx(); Result res = database.execute(query, params)) {
        while (res.hasNext()) {
            rows.add(res.next());
        }
        tx.success();
    } catch (Exception e) {
        LOG.error("Error while creating co-occurrences: ", e);
        // do not build co-occurrences from a partially fetched result
        rows.clear();
    }
    for (Map<String, Object> next : rows) {
        Long tag1 = toLong(next.get("tag1"));
        Long tag2 = toLong(next.get("tag2"));
        String tagVal1 = (String) next.get("tag1_id");
        String tagVal2 = (String) next.get("tag2_id");
        long tag1Start = offsetStart + toLong(next.get("sourceStartPosition"));
        long tag2Start = offsetStart + toLong(next.get("destinationStartPosition"));
        long tag1End = offsetStart + toLong(next.get("sourceEndPosition"));
        long tag2End = offsetStart + toLong(next.get("destinationEndPosition"));
        List<String> pos1 = next.get("pos1") != null ? Arrays.asList((String[]) next.get("pos1")) : new ArrayList<>();
        List<String> pos2 = next.get("pos2") != null ? Arrays.asList((String[]) next.get("pos2")) : new ArrayList<>();
        // a tag is admitted when it carries no POS at all or at least one admitted POS
        boolean bPOS1 = pos1.isEmpty() || pos1.stream().anyMatch(pos -> admittedPOSs.contains(pos));
        boolean bPOS2 = pos2.isEmpty() || pos2.stream().anyMatch(pos -> admittedPOSs.contains(pos));
        // fill tag co-occurrences (adjacency matrix)
        if (bPOS1 && bPOS2 && tagVal1 != null && tagVal2 != null) {
            CoOccurrenceItem co = new CoOccurrenceItem(tag1, (int) tag1Start, tag2, (int) tag2Start);
            co.addEndPositions((int) tag1End, (int) tag2End);
            prelim.add(co);
        }
        // for logging purposes and for `expandNamedEntities()`
        if (tag1 != null) {
            idToValue.put(tag1, tagVal1);
        }
        if (tag2 != null) {
            idToValue.put(tag2, tagVal2);
        }
    }
}
Also used : NLPManager(com.graphaware.nlp.NLPManager) java.util(java.util) DynamicConfiguration(com.graphaware.nlp.configuration.DynamicConfiguration) Log(org.neo4j.logging.Log) TfIdfObject(com.graphaware.nlp.domain.TfIdfObject) PageRank(com.graphaware.nlp.ml.pagerank.PageRank) Keyword(com.graphaware.nlp.domain.Keyword) Collectors(java.util.stream.Collectors) AtomicReference(java.util.concurrent.atomic.AtomicReference) CoOccurrenceItem(com.graphaware.nlp.ml.pagerank.CoOccurrenceItem) Relationships(com.graphaware.nlp.persistence.constants.Relationships) TypeConverter(com.graphaware.nlp.util.TypeConverter) Labels(com.graphaware.nlp.persistence.constants.Labels) org.neo4j.graphdb(org.neo4j.graphdb) LoggerFactory(com.graphaware.common.log.LoggerFactory) Pair(com.graphaware.common.util.Pair) PipelineSpecification(com.graphaware.nlp.dsl.request.PipelineSpecification) CoOccurrenceItem(com.graphaware.nlp.ml.pagerank.CoOccurrenceItem) TfIdfObject(com.graphaware.nlp.domain.TfIdfObject)

Example 3 with CoOccurrenceItem

use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.

the class TextRank method evaluate.

/**
 * Runs TextRank over the given annotated texts and returns the extracted keywords / key phrases.
 *
 * Steps: build the co-occurrence graph, run PageRank on it, take the top-ranked tags
 * ({@code topxTags} fraction of all ranked tags), merge directly adjacent keyword occurrences
 * into phrases, optionally add expanded named entities, and optionally clean the final list.
 *
 * @param annotatedTexts nodes of the annotated texts to analyse
 * @param language       language passed on to the co-occurrence creation
 * @param iter           forwarded to {@code PageRank.run}
 * @param damp           forwarded to {@code PageRank.run}
 * @param threshold      forwarded to {@code PageRank.run}
 * @return a successful result with the keywords (empty when no co-occurrences exist), or a failed
 *         result when PageRank returns no values
 */
public TextRankResult evaluate(List<Node> annotatedTexts, String language, int iter, double damp, double threshold) {
    Map<Long, Map<Long, CoOccurrenceItem>> coOccurrence = createCooccurrences(annotatedTexts, language, cooccurrencesFromDependencies);
    if (coOccurrence == null) {
        return TextRankResult.SUCCESS(new HashMap<>());
    }
    PageRank pageRank = new PageRank(database);
    Map<Long, Double> pageRanks = pageRank.run(coOccurrence, iter, damp, threshold);
    // Abort before rebuilding co-occurrences: the rebuilt graph is never used on the failure path.
    if (pageRanks == null) {
        LOG.error("Page ranks not retrieved, aborting evaluate() method ...");
        return TextRankResult.FAILED("Page ranks not retrieved");
    }
    if (cooccurrencesFromDependencies) {
        coOccurrence.clear();
        // co-occurrences from natural word flow; needed for merging keywords into key phrases
        coOccurrence = createCooccurrences(annotatedTexts, language, false);
    }
    int n_oneThird = (int) (pageRanks.size() * topxTags);
    // A set, because membership is tested for every keyword occurrence below.
    Set<Long> topThird = new HashSet<>(getTopX(pageRanks, n_oneThird));
    LOG.info("Keyword candidates are top " + n_oneThird + " tags from this list:");
    pageRanks.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).forEach(en -> LOG.debug("   " + idToValue.get(en.getKey()) + ": " + en.getValue()));
    Map<String, Object> params = new HashMap<>();
    params.put("posList", admittedPOSs);
    params.put("stopwords", removeStopWords ? stopWords : new ArrayList<>());
    // Detail tag analysis - get start & end positions and related tags (dependencies)
    List<KeywordExtractedItem> keywordsOccurrences = new ArrayList<>();
    Map<Long, KeywordExtractedItem> keywordMap = new HashMap<>();
    List<Long> wrongNEs = new ArrayList<>();
    for (Node node : annotatedTexts) {
        params.put("id", node.getId());
        detailedTagAnalysis(GET_TAG_QUERY, params, pageRanks, keywordsOccurrences, keywordMap, wrongNEs);
    }
    Map<String, Keyword> results = new HashMap<>();
    while (!keywordsOccurrences.isEmpty()) {
        KeywordExtractedItem keywordOccurrence = keywordsOccurrences.remove(0);
        String currValue = keywordOccurrence.getValue();
        double currRelevance = keywordOccurrence.getRelevance();
        int currNTopRated = 0;
        boolean isTopRated = topThird.contains(keywordOccurrence.getTagId());
        Set<Long> relTagIDs = getRelTagsIntoDepth(keywordOccurrence, keywordsOccurrences);
        // keep only those that are among top 1/3
        relTagIDs.retainAll(topThird);
        // Without dependencies only top-rated keywords qualify; with dependencies a keyword
        // also qualifies when one of its related tags is top-rated.
        if (!isTopRated && (!useDependencies || relTagIDs.isEmpty())) {
            continue;
        }
        if (isTopRated) {
            currNTopRated++;
        }
        Map<String, Keyword> localResults;
        do {
            int endPosition = keywordOccurrence.getEndPosition();
            localResults = checkNextKeyword(keywordOccurrence, coOccurrence, keywordMap);
            if (!localResults.isEmpty()) {
                // stays null (ending the phrase) unless the next occurrence can be appended
                keywordOccurrence = null;
                for (Map.Entry<String, Keyword> item : localResults.entrySet()) {
                    if (keywordsOccurrences.isEmpty()) {
                        // no occurrence left to merge with; previously get(0) threw here
                        break;
                    }
                    KeywordExtractedItem nextKeyword = keywordsOccurrences.get(0);
                    if (nextKeyword != null
                            && nextKeyword.getValue().equalsIgnoreCase(item.getKey())
                            && (topThird.contains(nextKeyword.getTagId()) || useDependencies)
                            // crucial condition for graphs from co-occurrences, but very useful also for graphs from dependencies
                            && (nextKeyword.getStartPosition() - endPosition) == 1) {
                        currValue = currValue.trim().split("_")[0] + " " + item.getKey();
                        currRelevance += item.getValue().getRelevance();
                        if (topThird.contains(nextKeyword.getTagId())) {
                            currNTopRated++;
                        }
                        keywordOccurrence = nextKeyword;
                        keywordsOccurrences.remove(0);
                    }
                }
            }
        } while (!localResults.isEmpty() && keywordOccurrence != null);
        if (currNTopRated > 0) {
            addToResults(currValue, currRelevance, TFIDF_1_1, currNTopRated, results, 1);
        }
    }
    if (expandNEs) {
        // add named entities that contain at least some of the top 1/3 of words
        for (Long key : neExpanded.keySet()) {
            long nTopRated = neExpanded.get(key).stream().filter(v -> topThird.contains(v)).count();
            if (nTopRated == 0 || wrongNEs.contains(key)) {
                continue;
            }
            String keystr = idToValue.get(key);
            double pr = pageRanks.containsKey(key) ? pageRanks.get(key) : 0.;
            if (pr == 0.) {
                // set PageRank value of a NE to max value of PR of its composite words
                pr = pageRanks.entrySet().stream().filter(en -> neExpanded.get(key).contains(en.getKey())).mapToDouble(en -> en.getValue()).max().orElse(0.);
            }
            addToResults(keystr, pr, TFIDF_1_1, (int) nTopRated, results, 1);
        }
    }
    computeTotalOccurrence(results);
    if (cleanKeywords) {
        results = cleanFinalKeywords(results, n_oneThird);
    }
    return TextRankResult.SUCCESS(results);
}
Also used : NLPManager(com.graphaware.nlp.NLPManager) java.util(java.util) DynamicConfiguration(com.graphaware.nlp.configuration.DynamicConfiguration) Log(org.neo4j.logging.Log) TfIdfObject(com.graphaware.nlp.domain.TfIdfObject) PageRank(com.graphaware.nlp.ml.pagerank.PageRank) Keyword(com.graphaware.nlp.domain.Keyword) Collectors(java.util.stream.Collectors) AtomicReference(java.util.concurrent.atomic.AtomicReference) CoOccurrenceItem(com.graphaware.nlp.ml.pagerank.CoOccurrenceItem) Relationships(com.graphaware.nlp.persistence.constants.Relationships) TypeConverter(com.graphaware.nlp.util.TypeConverter) Labels(com.graphaware.nlp.persistence.constants.Labels) org.neo4j.graphdb(org.neo4j.graphdb) LoggerFactory(com.graphaware.common.log.LoggerFactory) Pair(com.graphaware.common.util.Pair) PipelineSpecification(com.graphaware.nlp.dsl.request.PipelineSpecification) Keyword(com.graphaware.nlp.domain.Keyword) PageRank(com.graphaware.nlp.ml.pagerank.PageRank) AtomicReference(java.util.concurrent.atomic.AtomicReference) TfIdfObject(com.graphaware.nlp.domain.TfIdfObject)

Example 4 with CoOccurrenceItem

use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.

the class TextRank method addTagToCoOccurrence.

/**
 * Registers one co-occurrence of {@code source} followed by {@code destination} in the
 * adjacency map: the first occurrence of the pair creates a new item, every further one
 * increments the item's count and records the additional start positions.
 *
 * @param results                  adjacency map (source -> destination -> item), modified in place
 * @param source                   id of the source tag
 * @param sourceStartPosition      start position of the source tag
 * @param destination              id of the destination tag
 * @param destinationStartPosition start position of the destination tag
 */
private void addTagToCoOccurrence(Map<Long, Map<Long, CoOccurrenceItem>> results, Long source, int sourceStartPosition, Long destination, int destinationStartPosition) {
    final Map<Long, CoOccurrenceItem> outgoing;
    if (results.containsKey(source)) {
        outgoing = results.get(source);
    } else {
        outgoing = new HashMap<>();
        results.put(source, outgoing);
    }
    if (!outgoing.containsKey(destination)) {
        // first time this pair is seen
        outgoing.put(destination, new CoOccurrenceItem(source, sourceStartPosition, destination, destinationStartPosition));
        return;
    }
    CoOccurrenceItem existing = outgoing.get(destination);
    existing.incCount();
    existing.addPositions(sourceStartPosition, destinationStartPosition);
}
Also used : CoOccurrenceItem(com.graphaware.nlp.ml.pagerank.CoOccurrenceItem)

Example 5 with CoOccurrenceItem

use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.

the class TextRankSummarizer method addToCoOccurrence.

/**
 * Sets the weight of the edge {@code source -> destination} in the adjacency map to {@code w},
 * creating the inner map and the item (with both positions set to 0) when they do not exist yet.
 * An existing weight is overwritten, not accumulated.
 *
 * @param results     adjacency map (source -> destination -> item), modified in place
 * @param source      id of the source node
 * @param destination id of the destination node
 * @param w           weight stored as the item's count
 */
private void addToCoOccurrence(Map<Long, Map<Long, CoOccurrenceItem>> results, Long source, Long destination, double w) {
    // computeIfAbsent replaces the containsKey/get and put/get double lookups
    results.computeIfAbsent(source, key -> new HashMap<>())
            .computeIfAbsent(destination, key -> new CoOccurrenceItem(source, 0, destination, 0))
            .setCount(w);
}
Also used : CoOccurrenceItem(com.graphaware.nlp.ml.pagerank.CoOccurrenceItem)

Aggregations

CoOccurrenceItem (com.graphaware.nlp.ml.pagerank.CoOccurrenceItem)5 LoggerFactory (com.graphaware.common.log.LoggerFactory)2 Pair (com.graphaware.common.util.Pair)2 NLPManager (com.graphaware.nlp.NLPManager)2 DynamicConfiguration (com.graphaware.nlp.configuration.DynamicConfiguration)2 Keyword (com.graphaware.nlp.domain.Keyword)2 TfIdfObject (com.graphaware.nlp.domain.TfIdfObject)2 PipelineSpecification (com.graphaware.nlp.dsl.request.PipelineSpecification)2 PageRank (com.graphaware.nlp.ml.pagerank.PageRank)2 Labels (com.graphaware.nlp.persistence.constants.Labels)2 Relationships (com.graphaware.nlp.persistence.constants.Relationships)2 TypeConverter (com.graphaware.nlp.util.TypeConverter)2 java.util (java.util)2 AtomicReference (java.util.concurrent.atomic.AtomicReference)2 Collectors (java.util.stream.Collectors)2 org.neo4j.graphdb (org.neo4j.graphdb)2 Log (org.neo4j.logging.Log)2