use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.
the class TextRank method computeCooccurrent.
/**
 * Turns the preliminary list of co-occurrence items into an adjacency map
 * {@code source tag -> (destination tag -> CoOccurrenceItem)}.
 *
 * <p>When named-entity expansion is enabled ({@code expandNEs}) and the items do not come
 * from dependencies, a tag found in {@code neExp} is replaced by one of its listed parts:
 * the source side by the second element of the LAST pair (its start is shifted by that pair's
 * first element), the destination side by the second element of the FIRST pair.
 * {@code connectTagsInNE} is invoked for the expanded entity before the replacement.
 *
 * @param fromDependencies whether {@code prelim} was built from dependencies; disables expansion
 * @param prelim           preliminary co-occurrence items, processed in list order
 * @param neExp            per tag id, the list of pairs describing the expanded named entity
 *                         (presumably position offset / component tag id — verify against the producer)
 * @return the populated adjacency map
 */
private Map<Long, Map<Long, CoOccurrenceItem>> computeCooccurrent(boolean fromDependencies, List<CoOccurrenceItem> prelim, Map<Long, List<Pair<Long, Long>>> neExp) {
    final Map<Long, Map<Long, CoOccurrenceItem>> adjacency = new HashMap<>();
    // id of the named entity that was expanded as destination of the previous item (0 = none)
    long lastNamedEntity = 0L;

    for (CoOccurrenceItem item : prelim) {
        Long srcTag = item.getSource();
        Long dstTag = item.getDestination();
        int srcStart = item.getStartPositions().get(0).first().intValue();
        int dstStart = item.getStartPositions().get(0).second().intValue();

        if (expandNEs && !fromDependencies) {
            if (neExp.containsKey(srcTag)) {
                final List<Pair<Long, Long>> srcParts = neExp.get(srcTag);
                // skip re-connecting when this entity was just handled as the previous destination
                final boolean alreadyConnected = lastNamedEntity != 0L && lastNamedEntity == srcTag.longValue();
                if (!alreadyConnected) {
                    connectTagsInNE(adjacency, srcParts, srcStart);
                    lastNamedEntity = 0L;
                }
                // continue the chain from the last listed part of the entity
                final Pair<Long, Long> lastPart = srcParts.get(srcParts.size() - 1);
                srcStart += lastPart.first().intValue();
                srcTag = lastPart.second();
            }

            if (neExp.containsKey(dstTag)) {
                final List<Pair<Long, Long>> dstParts = neExp.get(dstTag);
                connectTagsInNE(adjacency, dstParts, dstStart);
                lastNamedEntity = dstTag;
                // the incoming edge points at the first listed part of the entity
                dstTag = dstParts.get(0).second();
            } else {
                lastNamedEntity = 0L;
            }
        }

        addTagToCoOccurrence(adjacency, srcTag, srcStart, dstTag, dstStart);
        if (!directionsMatter) {
            // direction of co-occurrence relationships is not important: add the reverse edge too
            addTagToCoOccurrence(adjacency, dstTag, dstStart, srcTag, srcStart);
        }
    }
    return adjacency;
}
use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.
the class TextRank method processOneAnnotatedText.
/**
 * Runs {@code query} for one annotated text and appends the resulting tag co-occurrences
 * to {@code prelim}; also records tag id -> tag value in {@code idToValue}.
 *
 * <p>Positions of the new items are shifted by an offset derived from the last item already
 * in {@code prelim}, so that consecutive annotated texts form one continuous position space.
 *
 * <p>All rows are fetched while the transaction is still open and the {@code Result} is closed
 * together with it; only then are the rows processed. If the query (or fetching its rows)
 * fails, the error is logged and nothing is added.
 *
 * @param query  Cypher query returning {@code tag1}, {@code tag2}, {@code tag1_id}, {@code tag2_id},
 *               {@code sourceStartPosition}, {@code destinationStartPosition},
 *               {@code sourceEndPosition}, {@code destinationEndPosition}, {@code pos1}, {@code pos2}
 * @param params query parameters
 * @param prelim accumulator of co-occurrence items; appended to in place
 */
private void processOneAnnotatedText(String query, Map<String, Object> params, List<CoOccurrenceItem> prelim) {
    int offsetStart = 0;
    if (!prelim.isEmpty()) {
        // find the last word position from previous annotated text and add some number so we can merge previous text with the coming one
        offsetStart = prelim.get(prelim.size() - 1).getEndPositions().get(0).second().intValue() + 2;
        LOG.debug("\n Processing another AnnotatedText. Offset: " + offsetStart);
    }

    // Drain the result inside the transaction: a Result must not be iterated after its transaction has been closed.
    final List<Map<String, Object>> rows = new ArrayList<>();
    try (Transaction tx = database.beginTx(); Result res = database.execute(query, params)) {
        while (res != null && res.hasNext()) {
            rows.add(res.next());
        }
        tx.success();
    } catch (Exception e) {
        LOG.error("Error while creating co-occurrences: ", e);
        return;
    }

    for (Map<String, Object> next : rows) {
        Long tag1 = toLong(next.get("tag1"));
        Long tag2 = toLong(next.get("tag2"));
        String tagVal1 = (String) next.get("tag1_id");
        String tagVal2 = (String) next.get("tag2_id");
        Long tag1Start = offsetStart + toLong(next.get("sourceStartPosition"));
        Long tag2Start = offsetStart + toLong(next.get("destinationStartPosition"));
        Long tag1End = offsetStart + toLong(next.get("sourceEndPosition"));
        Long tag2End = offsetStart + toLong(next.get("destinationEndPosition"));
        List<String> pos1 = next.get("pos1") != null ? Arrays.asList((String[]) next.get("pos1")) : new ArrayList<>();
        List<String> pos2 = next.get("pos2") != null ? Arrays.asList((String[]) next.get("pos2")) : new ArrayList<>();

        // check whether POS of both tags are admitted; a tag without any POS is always admitted
        boolean bPOS1 = pos1.stream().anyMatch(pos -> admittedPOSs.contains(pos)) || pos1.isEmpty();
        boolean bPOS2 = pos2.stream().anyMatch(pos -> admittedPOSs.contains(pos)) || pos2.isEmpty();

        // fill tag co-occurrences (adjacency matrix)
        if (bPOS1 && bPOS2 && tagVal1 != null && tagVal2 != null) {
            CoOccurrenceItem co = new CoOccurrenceItem(tag1, tag1Start.intValue(), tag2, tag2Start.intValue());
            co.addEndPositions(tag1End.intValue(), tag2End.intValue());
            prelim.add(co);
        }

        // for logging purposes and for `expandNamedEntities()`
        if (tag1 != null) {
            idToValue.put(tag1, tagVal1);
        }
        if (tag2 != null) {
            idToValue.put(tag2, tagVal2);
        }
    }
}
use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.
the class TextRank method evaluate.
/**
 * Extracts keywords / key phrases from the given annotated texts.
 *
 * <p>Steps, in order: build the tag co-occurrence graph, run PageRank on it, take the top
 * {@code topxTags} fraction of ranked tags as keyword candidates, collect detailed tag
 * occurrences per annotated text, merge directly adjacent keyword occurrences into phrases,
 * optionally add expanded named entities, compute total occurrences and optionally clean
 * the final keyword set.
 *
 * @param annotatedTexts annotated-text nodes to analyse
 * @param language       language passed on to {@code createCooccurrences}
 * @param iter           passed to {@code PageRank.run} (presumably the iteration count)
 * @param damp           passed to {@code PageRank.run} (presumably the damping factor)
 * @param threshold      passed to {@code PageRank.run} (presumably the convergence threshold)
 * @return {@code SUCCESS} with the keyword map (empty when no co-occurrences could be created),
 *         or {@code FAILED} when PageRank returned no result
 */
public TextRankResult evaluate(List<Node> annotatedTexts, String language, int iter, double damp, double threshold) {
    Map<Long, Map<Long, CoOccurrenceItem>> coOccurrence = createCooccurrences(annotatedTexts, language, cooccurrencesFromDependencies);
    if (coOccurrence == null) {
        return TextRankResult.SUCCESS(new HashMap<>());
    }

    PageRank pageRank = new PageRank(database);
    Map<Long, Double> pageRanks = pageRank.run(coOccurrence, iter, damp, threshold);
    if (cooccurrencesFromDependencies) {
        coOccurrence.clear();
        // co-occurrences from natural word flow; needed for merging keywords into key phrases
        coOccurrence = createCooccurrences(annotatedTexts, language, false);
    }
    if (pageRanks == null) {
        LOG.error("Page ranks not retrieved, aborting evaluate() method ...");
        return TextRankResult.FAILED("Page ranks not retrieved");
    }

    int n_oneThird = (int) (pageRanks.size() * topxTags);
    List<Long> topThird = getTopX(pageRanks, n_oneThird);
    LOG.info("Keyword candidates are top " + n_oneThird + " tags from this list:");
    pageRanks.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).forEach(en -> LOG.debug(" " + idToValue.get(en.getKey()) + ": " + en.getValue()));

    Map<String, Object> params = new HashMap<>();
    params.put("posList", admittedPOSs);
    params.put("stopwords", removeStopWords ? stopWords : new ArrayList<>());

    // Detail tag analysis - get start & end positions and related tags (dependencies)
    List<KeywordExtractedItem> keywordsOccurrences = new ArrayList<>();
    Map<Long, KeywordExtractedItem> keywordMap = new HashMap<>();
    List<Long> wrongNEs = new ArrayList<>();
    for (Node node : annotatedTexts) {
        params.put("id", node.getId());
        detailedTagAnalysis(GET_TAG_QUERY, params, pageRanks, keywordsOccurrences, keywordMap, wrongNEs);
    }

    Map<String, Keyword> results = new HashMap<>();
    while (!keywordsOccurrences.isEmpty()) {
        // Atomic references are used only because these values are updated from inside the lambda below.
        final AtomicReference<KeywordExtractedItem> keywordOccurrence = new AtomicReference<>(keywordsOccurrences.remove(0));
        final AtomicReference<String> currValue = new AtomicReference<>(keywordOccurrence.get().getValue());
        final AtomicReference<Double> currRelevance = new AtomicReference<>(keywordOccurrence.get().getRelevance());
        final AtomicReference<Integer> currNTopRated = new AtomicReference<>(0);

        Set<Long> relTagIDs = getRelTagsIntoDepth(keywordOccurrence.get(), keywordsOccurrences);
        // keep only those that are among top 1/3
        relTagIDs.retainAll(topThird);

        // if useDependencies==false, keep only those keywords that are among top 1/3
        if (!useDependencies && !topThird.contains(keywordOccurrence.get().getTagId())) {
            continue;
        }
        // with dependencies, a non-top keyword is kept only when it is related to a top one
        if (useDependencies && !topThird.contains(keywordOccurrence.get().getTagId()) && relTagIDs.size() == 0) {
            continue;
        }

        Map<String, Keyword> localResults;
        if (topThird.contains(keywordOccurrence.get().getTagId())) {
            currNTopRated.set(currNTopRated.get() + 1);
        }

        // Greedily extend the current keyword with the directly following occurrence(s).
        do {
            int endPosition = keywordOccurrence.get().getEndPosition();
            localResults = checkNextKeyword(keywordOccurrence.get(), coOccurrence, keywordMap);
            if (localResults.size() > 0) {
                // stays null (ending the loop) unless one of the candidates matches the next occurrence
                keywordOccurrence.set(null);
                localResults.entrySet().stream().forEach((item) -> {
                    // The list may already be exhausted (current occurrence was the last one, or an
                    // earlier candidate consumed the last one); treat that as "no next keyword".
                    KeywordExtractedItem nextKeyword = keywordsOccurrences.isEmpty() ? null : keywordsOccurrences.get(0);
                    if (nextKeyword != null
                            && nextKeyword.getValue().equalsIgnoreCase(item.getKey())
                            && (topThird.contains(nextKeyword.getTagId()) || useDependencies)
                            // crucial condition for graphs from co-occurrences, but very useful also for graphs from dependencies
                            && (nextKeyword.getStartPosition() - endPosition) == 1) {
                        String newCurrValue = currValue.get().trim().split("_")[0] + " " + item.getKey();
                        double newCurrRelevance = currRelevance.get() + item.getValue().getRelevance();
                        if (topThird.contains(nextKeyword.getTagId())) {
                            currNTopRated.set(currNTopRated.get() + 1);
                        }
                        currValue.set(newCurrValue);
                        currRelevance.set(newCurrRelevance);
                        keywordOccurrence.set(nextKeyword);
                        keywordsOccurrences.remove(0);
                    }
                });
            }
        } while (!localResults.isEmpty() && keywordOccurrence.get() != null);

        if (currNTopRated.get() > 0) {
            addToResults(currValue.get(), currRelevance.get(), TFIDF_1_1, currNTopRated.get(), results, 1);
        }
    }

    if (expandNEs) {
        // add named entities that contain at least some of the top 1/3 of words
        for (Long key : neExpanded.keySet()) {
            long nTopRated = neExpanded.get(key).stream().filter(v -> topThird.contains(v)).count();
            if (nTopRated == 0) {
                continue;
            }
            if (wrongNEs.contains(key)) {
                continue;
            }
            String keystr = idToValue.get(key);
            double pr = pageRanks.containsKey(key) ? pageRanks.get(key) : 0.;
            // set PageRank value of a NE to max value of PR of its composite words
            if (pr == 0.) {
                pr = (double) pageRanks.entrySet().stream().filter(en -> neExpanded.get(key).contains(en.getKey())).mapToDouble(en -> en.getValue()).max().orElse(0.);
            }
            addToResults(keystr, pr, TFIDF_1_1, (int) nTopRated, results, 1);
        }
    }

    computeTotalOccurrence(results);
    if (cleanKeywords) {
        results = cleanFinalKeywords(results, n_oneThird);
    }
    return TextRankResult.SUCCESS(results);
}
use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.
the class TextRank method addTagToCoOccurrence.
/**
 * Records one co-occurrence {@code source -> destination} in the adjacency map.
 *
 * <p>If the edge already exists its count is incremented and the given start positions are
 * appended; otherwise a new {@code CoOccurrenceItem} is created for it. The per-source row
 * is created on demand.
 *
 * @param results                  adjacency map, modified in place
 * @param source                   id of the source tag
 * @param sourceStartPosition      start position of the source tag
 * @param destination              id of the destination tag
 * @param destinationStartPosition start position of the destination tag
 */
private void addTagToCoOccurrence(Map<Long, Map<Long, CoOccurrenceItem>> results, Long source, int sourceStartPosition, Long destination, int destinationStartPosition) {
    final Map<Long, CoOccurrenceItem> outgoing;
    if (results.containsKey(source)) {
        outgoing = results.get(source);
    } else {
        outgoing = new HashMap<>();
        results.put(source, outgoing);
    }

    // first time this edge is seen: just register it
    if (!outgoing.containsKey(destination)) {
        outgoing.put(destination, new CoOccurrenceItem(source, sourceStartPosition, destination, destinationStartPosition));
        return;
    }

    // known edge: bump its counter and remember the additional positions
    final CoOccurrenceItem existing = outgoing.get(destination);
    existing.incCount();
    existing.addPositions(sourceStartPosition, destinationStartPosition);
}
use of com.graphaware.nlp.ml.pagerank.CoOccurrenceItem in project neo4j-nlp by graphaware.
the class TextRankSummarizer method addToCoOccurrence.
/**
 * Sets the weight of the edge {@code source -> destination} in the adjacency map to {@code w}.
 *
 * <p>The per-source row and the edge item (created with start positions 0/0) are added on
 * demand; an existing edge simply has its count overwritten.
 *
 * @param results     adjacency map, modified in place
 * @param source      id of the source node
 * @param destination id of the destination node
 * @param w           value stored as the edge's count
 */
private void addToCoOccurrence(Map<Long, Map<Long, CoOccurrenceItem>> results, Long source, Long destination, double w) {
    final Map<Long, CoOccurrenceItem> outgoing;
    if (results.containsKey(source)) {
        outgoing = results.get(source);
    } else {
        outgoing = new HashMap<>();
        results.put(source, outgoing);
    }

    // make sure the edge exists, then (over)write its weight
    if (!outgoing.containsKey(destination)) {
        outgoing.put(destination, new CoOccurrenceItem(source, 0, destination, 0));
    }
    outgoing.get(destination).setCount(w);
}
Aggregations