Use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp:
class ScorePhrases, method runParallelApplyPats.
/**
 * Applies a single learned pattern to all the given sentences, in parallel, and
 * accumulates the candidate phrases it extracts.
 *
 * @param sents map from sentence id to sentence to apply the pattern over
 * @param label the answer-class label currently being learned
 * @param pattern the (surface or dependency) pattern to apply
 * @param wordsandLemmaPatExtracted out-param: (phrase, pattern) extraction counts
 * @param matchedTokensByPat out-param: matched token spans, keyed by pattern
 * @param alreadyLabeledWords out-param: phrases that were already labeled
 */
private void runParallelApplyPats(Map<String, DataInstance> sents, String label, E pattern, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat, Set<CandidatePhrase> alreadyLabeledWords) {
  Redwood.log(Redwood.DBG, "Applying pattern " + pattern + " to a total of " + sents.size() + " sentences ");
  List<String> notAllowedClasses = new ArrayList<>();
  List<String> sentids = CollectionUtils.toList(sents.keySet());
  if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass) {
    // Forbid extracting phrases containing words already labeled with another class.
    for (String l : constVars.getAnswerClass().keySet()) {
      if (!l.equals(label)) {
        notAllowedClasses.add(l);
      }
    }
    notAllowedClasses.add("OTHERSEM");
  }
  // Compile the pattern into the executable form matching its type.
  Map<TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
  Map<SemgrexPattern, E> depPatternsLearnedThisIterConverted = null;
  if (constVars.patternType.equals(PatternFactory.PatternType.SURFACE)) {
    surfacePatternsLearnedThisIterConverted = new HashMap<>();
    String patternStr = null;
    try {
      patternStr = pattern.toString(notAllowedClasses);
      TokenSequencePattern pat = TokenSequencePattern.compile(constVars.env.get(label), patternStr);
      surfacePatternsLearnedThisIterConverted.put(pat, pattern);
    } catch (Exception e) {
      log.info("Error applying pattern " + patternStr + ". Probably an ill formed pattern (can be because of special symbols in label names). Contact the software developer.");
      throw e;
    }
  } else if (constVars.patternType.equals(PatternFactory.PatternType.DEP)) {
    depPatternsLearnedThisIterConverted = new HashMap<>();
    SemgrexPattern pat = SemgrexPattern.compile(pattern.toString(notAllowedClasses), new edu.stanford.nlp.semgraph.semgrex.Env(constVars.env.get(label).getVariables()));
    depPatternsLearnedThisIterConverted.put(pat, pattern);
  } else {
    throw new UnsupportedOperationException();
  }
  // Apply the patterns and extract candidate phrases
  int num;
  int numThreads = constVars.numThreads;
  // If number of sentences is less, do not create so many threads
  if (sents.size() < 50)
    numThreads = 1;
  // num = how many sentence ids each worker handles.
  if (numThreads == 1)
    num = sents.size();
  else
    num = sents.size() / (numThreads - 1);
  // BUGFIX: size the pool with the adjusted local numThreads. The original used
  // constVars.numThreads here, creating idle threads whenever the sentence count
  // was small enough (< 50) to have forced numThreads down to 1.
  ExecutorService executor = Executors.newFixedThreadPool(numThreads);
  List<Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>>> list = new ArrayList<>();
  for (int i = 0; i < numThreads; i++) {
    Callable<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> task = null;
    // Each worker gets a contiguous slice of sentence ids; a single worker gets all of them.
    if (pattern.type.equals(PatternFactory.PatternType.SURFACE))
      // Redwood.log(Redwood.DBG, "Applying pats: assigning sentences " + i*num + " to " +Math.min(sentids.size(), (i + 1) * num) + " to thread " + (i+1));
      task = new ApplyPatterns(sents, num == sents.size() ? sentids : sentids.subList(i * num, Math.min(sentids.size(), (i + 1) * num)), surfacePatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
    else
      task = new ApplyDepPatterns(sents, num == sents.size() ? sentids : sentids.subList(i * num, Math.min(sentids.size(), (i + 1) * num)), depPatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
    Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> submit = executor.submit(task);
    list.add(submit);
  }
  // Now retrieve the result
  for (Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> future : list) {
    try {
      Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> result = future.get();
      Redwood.log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.first());
      // Merge this worker's extractions into the shared out-params.
      wordsandLemmaPatExtracted.addAll(result.first());
      matchedTokensByPat.addAll(result.second());
      alreadyLabeledWords.addAll(result.third());
    } catch (Exception e) {
      // Cancel remaining workers before propagating; preserve the cause.
      executor.shutdownNow();
      throw new RuntimeException(e);
    }
  }
  executor.shutdown();
}
Use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp:
class Mention, method findDependentVerb.
/**
 * Finds the verb that governs this mention's head word, together with the
 * dependency relation connecting them.
 *
 * @param m the mention whose governing verb is sought
 * @return a (verb, relation-name) pair, or an empty Pair when the dependency
 *         graph has no roots or no governing verb matches
 */
private static Pair<IndexedWord, String> findDependentVerb(Mention m) {
  if (m.dependency.getRoots().isEmpty()) {
    return new Pair<>();
  }
  // would be nice to condense this pattern, but sadly =reln
  // always uses the last relation in the sequence, not the first
  SemgrexPattern verbPattern = SemgrexPattern.compile("{idx:" + (m.headIndex + 1) + "} [ <=reln {tag:/^V.*/}=verb | <=reln ({} << {tag:/^V.*/}=verb) ]");
  SemgrexMatcher verbMatcher = verbPattern.matcher(m.dependency);
  // Only the first match is used, so a single find() replaces the old while-return loop.
  if (verbMatcher.find()) {
    return Pair.makePair(verbMatcher.getNode("verb"), verbMatcher.getRelnString("reln"));
  }
  return new Pair<>();
}
Use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp:
class Ssurgeon, method ssurgeonPatternFromXML.
/**
 * Parses the root Element of a serialized Ssurgeon pattern (SSURGEON_ELEM_TAG)
 * into its corresponding SsurgeonPattern object.
 *
 * @param elt the XML element holding the pattern's uid, notes, semgrex
 *            expression, edit list, and optional predicate
 * @return the assembled SsurgeonPattern
 * @throws Exception if the semgrex, an edit line, or the predicate fails to parse
 */
@SuppressWarnings("unchecked")
public static SsurgeonPattern ssurgeonPatternFromXML(Element elt) throws Exception {
  String uid = getTagText(elt, SsurgeonPattern.UID_ELEM_TAG);
  String notes = getTagText(elt, SsurgeonPattern.NOTES_ELEM_TAG);
  String semgrexString = getTagText(elt, SsurgeonPattern.SEMGREX_ELEM_TAG);
  SemgrexPattern compiledSemgrex = SemgrexPattern.compile(semgrexString);
  SsurgeonPattern result = new SsurgeonPattern(uid, compiledSemgrex);
  result.setNotes(notes);
  // Each element node under the edit-list tag contributes one edit operation.
  NodeList editChildren = elt.getElementsByTagName(SsurgeonPattern.EDIT_LIST_ELEM_TAG);
  int editCount = editChildren.getLength();
  for (int idx = 0; idx < editCount; idx++) {
    Node child = editChildren.item(idx);
    if (child.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    String editLine = getEltText((Element) child);
    result.addEdit(Ssurgeon.parseEditLine(editLine));
  }
  // An optional predicate element gates when the pattern applies.
  Element predElt = getFirstTag(elt, SsurgeonPattern.PREDICATE_TAG);
  if (predElt != null) {
    result.setPredicate(assemblePredFromXML(getFirstChildElement(predElt)));
  }
  return result;
}
Use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp:
class UniversalEnglishGrammaticalStructure, method processNames.
/**
 * Looks for NPs that should have the {@code name} relation and
 * a) changes the structure such that the leftmost token becomes the head
 * b) changes the relation from {@code compound} to {@code name}.
 *
 * Requires NER tags.
 *
 * @param sg A semantic graph.
 */
private static void processNames(SemanticGraph sg) {
  if (!USE_NAME) {
    return;
  }
  /* Semgrex matching needs a graph with a root. */
  if (sg.getRoots().isEmpty()) {
    return;
  }
  // Bail out when the graph's tokens carry no NER annotation.
  IndexedWord rootToken = sg.getFirstRoot();
  if (rootToken == null || !rootToken.containsKey(CoreAnnotations.NamedEntityTagAnnotation.class)) {
    return;
  }
  // Match against a soft copy so mutations of sg don't disturb iteration.
  SemanticGraph snapshot = sg.makeSoftCopy();
  for (SemgrexPattern namePattern : NAME_PATTERNS) {
    SemgrexMatcher m = namePattern.matcher(snapshot);
    List<IndexedWord> parts = new ArrayList<>();
    IndexedWord currentHead = null;
    while (m.find()) {
      IndexedWord first = m.getNode("w1");
      IndexedWord second = m.getNode("w2");
      // A new head means the previous name (if any) is complete; rewrite it.
      if (currentHead != first) {
        if (currentHead != null) {
          processNamesHelper(sg, currentHead, parts);
          parts = new ArrayList<>();
        }
        currentHead = first;
      }
      // Only tokens sharing the head's NER tag belong to the same name.
      if (second.ner().equals(first.ner())) {
        parts.add(second);
      }
    }
    // Flush the last accumulated name, then re-snapshot since sg was mutated.
    if (currentHead != null) {
      processNamesHelper(sg, currentHead, parts);
      snapshot = sg.makeSoftCopy();
    }
  }
}
Use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp:
class CreateClauseDataset, method subjectObjectPairs.
/**
 * Create a dataset of subject/object pairs, such that a sequence of splits that segments this
 * subject and object is a correct sequence.
 *
 * @param depparse The dependency parse of the sentence.
 * @param tokens The tokens of the sentence (currently unused; kept for callers/debugging).
 * @param traceTargets The set of spans corresponding to targets of traces.
 * @param traceSources The set of indices in a sentence corresponding to the sources of traces.
 * @return A dataset of subject/object spans.
 */
@SuppressWarnings("UnusedParameters")
private static Collection<Pair<Span, Span>> subjectObjectPairs(SemanticGraph depparse, List<CoreLabel> tokens, Map<Integer, Span> traceTargets, Map<Integer, Integer> traceSources) {
// log(StringUtils.join(tokens.stream().map(CoreLabel::word), " "));
List<Pair<Span, Span>> data = new ArrayList<>();
// Pass 1: VP patterns — verbs with an object but no explicit subject; the
// subject must be recovered through a trace.
for (SemgrexPattern vpPattern : segmenter.VP_PATTERNS) {
SemgrexMatcher matcher = vpPattern.matcher(depparse);
while (matcher.find()) {
// Get the verb and object
IndexedWord verb = matcher.getNode("verb");
IndexedWord object = matcher.getNode("object");
if (verb != null && object != null) {
// See if there is already a subject attached
boolean hasSubject = false;
// Check outgoing edges of both the verb and the object for any *subj relation.
for (SemanticGraphEdge edge : depparse.outgoingEdgeIterable(verb)) {
if (edge.getRelation().toString().contains("subj")) {
hasSubject = true;
}
}
for (SemanticGraphEdge edge : depparse.outgoingEdgeIterable(object)) {
if (edge.getRelation().toString().contains("subj")) {
hasSubject = true;
}
}
if (!hasSubject) {
// Get the spans for the verb and object
Optional<List<IndexedWord>> verbChunk = segmenter.getValidChunk(depparse, verb, segmenter.VALID_ADVERB_ARCS, Optional.empty(), true);
Optional<List<IndexedWord>> objectChunk = segmenter.getValidChunk(depparse, object, segmenter.VALID_OBJECT_ARCS, Optional.empty(), true);
if (verbChunk.isPresent() && objectChunk.isPresent()) {
verbChunk.get().sort(Comparator.comparingInt(IndexedWord::index));
objectChunk.get().sort(Comparator.comparingInt(IndexedWord::index));
// Find a trace
int traceId = -1;
Span verbSpan = toSpan(verbChunk.get());
// Widen the verb span by one token on each side when looking for a trace source.
Span traceSpan = Span.fromValues(verbSpan.start() - 1, verbSpan.end() + 1);
for (Map.Entry<Integer, Integer> entry : traceSources.entrySet()) {
if (traceSpan.contains(entry.getValue())) {
traceId = entry.getKey();
}
}
// noinspection StatementWithEmptyBody
if (traceId < 0) {
// Register the VP as an unknown VP
// List<CoreLabel> vpChunk = new ArrayList<>();
// vpChunk.addAll(verbChunk.get());
// vpChunk.addAll(objectChunk.get());
// Collections.sort(vpChunk, (a, b) -> a.index() - b.index());
// debug("could not find trace for " + vpChunk);
} else {
// Add the obj chunk
// The subject span is the target of the trace whose source fell near the verb.
Span subjectSpan = traceTargets.get(traceId);
Span objectSpan = toSpan(objectChunk.get());
if (subjectSpan != null) {
// debug("(" +
// StringUtils.join(tokens.subList(subjectSpan.start(), subjectSpan.end()).stream().map(CoreLabel::word), " ") + "; " +
// verb.word() + "; " +
// StringUtils.join(tokens.subList(objectSpan.start(), objectSpan.end()).stream().map(CoreLabel::word), " ") +
// ")");
data.add(Pair.makePair(subjectSpan, objectSpan));
}
}
}
}
}
}
}
// Run vanilla pattern splits
// Pass 2: verb patterns where both subject and object are explicit in the parse.
for (SemgrexPattern vpPattern : segmenter.VERB_PATTERNS) {
SemgrexMatcher matcher = vpPattern.matcher(depparse);
while (matcher.find()) {
// Get the verb and object
IndexedWord subject = matcher.getNode("subject");
IndexedWord object = matcher.getNode("object");
if (subject != null && object != null) {
Optional<List<IndexedWord>> subjectChunk = segmenter.getValidChunk(depparse, subject, segmenter.VALID_SUBJECT_ARCS, Optional.empty(), true);
Optional<List<IndexedWord>> objectChunk = segmenter.getValidChunk(depparse, object, segmenter.VALID_OBJECT_ARCS, Optional.empty(), true);
if (subjectChunk.isPresent() && objectChunk.isPresent()) {
Span subjectSpan = toSpan(subjectChunk.get());
Span objectSpan = toSpan(objectChunk.get());
data.add(Pair.makePair(subjectSpan, objectSpan));
}
}
}
}
return data;
}
Aggregations