Usage of edu.stanford.nlp.ling.IndexedWord in the CoreNLP project (stanfordnlp): class ScorePhrasesLearnFeatWt, method chooseUnknownPhrases.
/**
 * Samples candidate phrases from {@code sent} that are NOT labeled {@code label}, to serve as
 * unknown/negative examples during feature-weight learning.
 *
 * @param sent          the sentence to sample from
 * @param random        randomness source used to decide which tokens/heads are sampled
 * @param perSelect     per-token selection probability (also caps the number of sampled heads)
 * @param positiveClass the CoreLabel annotation key holding the gold label
 * @param label         the positive label; tokens carrying it are never part of a sample
 * @param maxNum        maximum number of head tokens to sample; 0 means "sample nothing"
 * @return the set of sampled unlabeled candidate phrases (possibly empty)
 * @throws RuntimeException if the pattern type is neither DEP nor SURFACE
 */
Set<CandidatePhrase> chooseUnknownPhrases(DataInstance sent, Random random, double perSelect, Class positiveClass, String label, int maxNum) {
  Set<CandidatePhrase> unknownSamples = new HashSet<>();
  if (maxNum == 0)
    return unknownSamples;
  // Accept a token only if it is not already labeled positive and is not a function word.
  Function<CoreLabel, Boolean> acceptWord = coreLabel ->
      !(coreLabel.get(positiveClass).equals(label) || constVars.functionWords.contains(coreLabel.word()));
  // NOTE(review): fixed seed 0 makes the phrase-length draw deterministic even though a
  // caller-supplied Random is available — confirm this is intentional.
  Random r = new Random(0);
  List<Integer> lengths = new ArrayList<>();
  for (int i = 1; i <= PatternFactory.numWordsCompoundMapped.get(label); i++)
    lengths.add(i);
  int length = CollectionUtils.sample(lengths, r);
  if (constVars.patternType.equals(PatternFactory.PatternType.DEP)) {
    // Dependency patterns: sample head tokens, then extract phrases from each head's subgraph.
    ExtractPhraseFromPattern extract = new ExtractPhraseFromPattern(true, length);
    SemanticGraph g = ((DataInstanceDep) sent).getGraph();
    Collection<CoreLabel> sampledHeads = CollectionUtils.sampleWithoutReplacement(sent.getTokens(), Math.min(maxNum, (int) (perSelect * sent.getTokens().size())), random);
    //TODO: change this for more efficient implementation
    List<String> textTokens = sent.getTokens().stream().map(x -> x.word()).collect(Collectors.toList());
    for (CoreLabel l : sampledHeads) {
      if (!acceptWord.apply(l))
        continue;
      // CoreLabel token indices are 1-based, matching SemanticGraph node indices.
      IndexedWord w = g.getNodeByIndex(l.index());
      List<String> outputPhrases = new ArrayList<>();
      List<ExtractedPhrase> extractedPhrases = new ArrayList<>();
      List<IntPair> outputIndices = new ArrayList<>();
      extract.printSubGraph(g, w, new ArrayList<>(), textTokens, outputPhrases, outputIndices, new ArrayList<>(), new ArrayList<>(), false, extractedPhrases, null, acceptWord);
      for (ExtractedPhrase p : extractedPhrases) {
        unknownSamples.add(CandidatePhrase.createOrGet(p.getValue(), null, p.getFeatures()));
      }
    }
  } else if (constVars.patternType.equals(PatternFactory.PatternType.SURFACE)) {
    // Surface patterns: take a window of `length` tokens roughly centered on each sampled index.
    CoreLabel[] tokens = sent.getTokens().toArray(new CoreLabel[0]);
    for (int i = 0; i < tokens.length; i++) {
      if (random.nextDouble() < perSelect) {
        int left = (length - 1) / 2;
        int right = length - 1 - left;
        StringBuilder phrase = new StringBuilder();
        boolean hasPositive = false;
        for (int j = Math.max(0, i - left); j < tokens.length && j <= i + right; j++) {
          // Discard the whole window if any token in it is labeled positive.
          if (tokens[j].get(positiveClass).equals(label)) {
            hasPositive = true;
            break;
          }
          phrase.append(' ').append(tokens[j].word());
        }
        String ph = phrase.toString().trim();
        if (!hasPositive && !ph.isEmpty() && !constVars.functionWords.contains(ph)) {
          unknownSamples.add(CandidatePhrase.createOrGet(ph));
        }
      }
    }
  } else
    throw new RuntimeException("not yet implemented");
  return unknownSamples;
}
Usage of edu.stanford.nlp.ling.IndexedWord in the CoreNLP project (stanfordnlp): class ApplyDepPatterns, method getMatchedTokensIndex.
/**
 * Runs the given semgrex {@code pattern} over the sentence's dependency {@code graph} and
 * returns the phrases it extracts.
 *
 * @param graph   dependency graph of the sentence
 * @param pattern compiled semgrex pattern to match
 * @param sent    the sentence (used for its token word strings)
 * @param label   label whose configured max compound-phrase length bounds extraction
 * @return the extracted phrases (possibly empty)
 */
private Collection<ExtractedPhrase> getMatchedTokensIndex(SemanticGraph graph, SemgrexPattern pattern, DataInstance sent, String label) {
  //TODO: look at the ignoreCommonTags flag
  ExtractPhraseFromPattern extract = new ExtractPhraseFromPattern(false, PatternFactory.numWordsCompoundMapped.get(label));
  Collection<IntPair> outputIndices = new ArrayList<>();
  boolean findSubTrees = true;
  List<CoreLabel> tokensC = sent.getTokens();
  //TODO: see if you can get rid of this (only used for matchedGraphs)
  List<String> tokens = tokensC.stream().map(x -> x.word()).collect(Collectors.toList());
  List<String> outputPhrases = new ArrayList<>();
  List<ExtractedPhrase> extractedPhrases = new ArrayList<>();
  // NOTE(review): a per-phrase feature-extraction Function (counting PARENTREL-* of the
  // matched head) used to be built here but was never passed to getSemGrexPatternNodes,
  // so it was dead code and has been removed; re-introduce it if features get wired in.
  extract.getSemGrexPatternNodes(graph, tokens, outputPhrases, outputIndices, pattern, findSubTrees, extractedPhrases, constVars.matchLowerCaseContext, matchingWordRestriction);
  //System.out.println("extracted phrases are " + extractedPhrases + " and output indices are " + outputIndices);
  return extractedPhrases;
}
Usage of edu.stanford.nlp.ling.IndexedWord in the CoreNLP project (stanfordnlp): class DepPatternFactory, method getPatternsForAllPhrases.
/**
 * Builds, for each leaf node of the sentence's dependency graph, the set of dependency
 * patterns anchored at that node, keyed by the node's 0-based token position.
 *
 * @param sent        sentence whose dependency graph is consulted (must be a DataInstanceDep)
 * @param commonWords common words passed through to pattern construction
 * @return map from 0-based token index to its patterns, or {@code null} when the graph is
 *         absent/empty or its leaf vertices cannot be computed
 */
static Map<Integer, Set<DepPattern>> getPatternsForAllPhrases(DataInstance sent, Set<CandidatePhrase> commonWords) {
  SemanticGraph graph = ((DataInstanceDep) sent).getGraph();
  if (graph == null || graph.isEmpty()) {
    System.out.println("graph is empty or null!");
    return null;
  }
  Set<IndexedWord> leafNodes;
  try {
    // NOTE(review): only leaf vertices are considered here, despite the original
    // variable name "allNodes" — confirm interior nodes are intentionally skipped.
    leafNodes = graph.getLeafVertices();
  } catch (IllegalArgumentException i) {
    return null;
  }
  Map<Integer, Set<DepPattern>> pats4Sent = new HashMap<>();
  for (IndexedWord node : leafNodes) {
    // IndexedWord indices are 1-based; shift to the 0-based token position.
    pats4Sent.put(node.index() - 1, getContext(node, graph, commonWords, sent));
  }
  return pats4Sent;
}
Usage of edu.stanford.nlp.ling.IndexedWord in the CoreNLP project (stanfordnlp): class ChineseGrammaticalStructure, method collapsePrepAndPoss.
/**
 * Collapses preposition constructions in a Chinese dependency parse: when a dependency
 * gov --dep--> P-tagged word and a second dependency P-word --dep--> X both exist, they are
 * replaced by a single collapsed dependency gov --prep(word)--> X named after the preposition.
 * The input collection is modified IN PLACE: collapsed pairs are removed (via the KILL
 * relation marker) and the new collapsed dependencies are added.
 *
 * @param list the sentence's typed dependencies; mutated in place
 */
private static void collapsePrepAndPoss(Collection<TypedDependency> list) {
  Collection<TypedDependency> newTypedDeps = new ArrayList<>();
  // Construct a map from words to the set of typed
  // dependencies in which the word appears as governor.
  Map<IndexedWord, Set<TypedDependency>> map = Generics.newHashMap();
  for (TypedDependency typedDep : list) {
    if (!map.containsKey(typedDep.gov())) {
      map.put(typedDep.gov(), Generics.<TypedDependency>newHashSet());
    }
    map.get(typedDep.gov()).add(typedDep);
  }
  for (TypedDependency td1 : list) {
    // Skip dependencies already consumed by an earlier collapse.
    if (td1.reln() != GrammaticalRelation.KILL) {
      IndexedWord td1Dep = td1.dep();
      String td1DepPOS = td1Dep.tag();
      // find all other typedDeps having our dep as gov
      Set<TypedDependency> possibles = map.get(td1Dep);
      if (possibles != null) {
        // look for the "second half": a dependency governed by td1's dependent.
        for (TypedDependency td2 : possibles) {
          // String td2DepPOS = td2Dep.parent().value();
          // Collapse only generic DEPENDENT pairs whose middle word is a preposition (POS "P").
          if (td1.reln() == DEPENDENT && td2.reln() == DEPENDENT && td1DepPOS.equals("P")) {
            // Name the collapsed relation after the preposition's surface form; fall back to
            // creating a fresh Chinese relation if it is not a predefined one.
            GrammaticalRelation td3reln = ChineseGrammaticalRelations.valueOf(td1Dep.value());
            if (td3reln == null) {
              td3reln = GrammaticalRelation.valueOf(Language.Chinese, td1Dep.value());
            }
            TypedDependency td3 = new TypedDependency(td3reln, td1.gov(), td2.dep());
            //log.info("adding: " + td3);
            newTypedDeps.add(td3);
            // remember these are "used up"
            td1.setReln(GrammaticalRelation.KILL);
            // remember these are "used up"
            td2.setReln(GrammaticalRelation.KILL);
          }
        }
        // The killed word no longer appears in the output, so re-attach any surviving
        // dependencies it governed to td1's governor instead.
        if (td1.reln().equals(GrammaticalRelation.KILL)) {
          for (TypedDependency td2 : possibles) {
            if (!td2.reln().equals(GrammaticalRelation.KILL)) {
              //log.info("td1 & td2: " + td1 + " & " + td2);
              td2.setGov(td1.gov());
            }
          }
        }
      }
    }
  }
  // now copy remaining unkilled TDs from here to new
  for (TypedDependency td : list) {
    if (!td.reln().equals(GrammaticalRelation.KILL)) {
      newTypedDeps.add(td);
    }
  }
  // forget all (esp. killed) TDs
  list.clear();
  list.addAll(newTypedDeps);
}
Usage of edu.stanford.nlp.ling.IndexedWord in the CoreNLP project (stanfordnlp): class UniversalEnglishGrammaticalStructure, method collapseReferent.
/**
 * This method will collapse a referent relation such as follows. e.g.:
 * "The man that I love ... " ref(man, that) dobj(love, that) -> ref(man, that) dobj(love,
 * man)
 *
 * That is, every non-ref edge pointing at a relative word is redirected to point at its
 * antecedent instead, so the relative pronoun is bypassed. The graph {@code sg} is
 * modified in place.
 *
 * @param sg the semantic graph to modify
 */
private static void collapseReferent(SemanticGraph sg) {
  // find typed deps of form ref(gov, dep)
  // put them in a List for processing
  List<SemanticGraphEdge> refs = new ArrayList<>(sg.findAllRelns(REFERENT));
  // Iterate over a soft copy so that edge removals/additions on sg below do not
  // invalidate the iteration.
  SemanticGraph sgCopy = sg.makeSoftCopy();
  // now substitute target of referent where possible
  for (SemanticGraphEdge ref : refs) {
    // take the relative word
    IndexedWord dep = ref.getDependent();
    // take the antecedent
    IndexedWord ant = ref.getGovernor();
    for (Iterator<SemanticGraphEdge> iter = sgCopy.incomingEdgeIterator(dep); iter.hasNext(); ) {
      SemanticGraphEdge edge = iter.next();
      // Skip the ref edge itself, and skip edges already coming from the antecedent
      // (redirecting those would create a self-loop / leave the relative word
      // disconnected) [cdm Jan 2010]
      if (edge.getRelation() != REFERENT && !edge.getGovernor().equals(ant)) {
        sg.removeEdge(edge);
        // NOTE(review): weight NEGATIVE_INFINITY and isExtra=true mirror how other
        // collapsed edges are added elsewhere in this class — confirm against siblings.
        sg.addEdge(edge.getGovernor(), ant, edge.getRelation(), Double.NEGATIVE_INFINITY, true);
      }
    }
  }
}
Aggregations