Use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd: class NLProcTools, method getTaggedTokens.
/**
 * Combines parallel lists of tokens, POS tags, and named-entity tags into a
 * single list of tagged token writables.
 *
 * @param tokens  surface tokens
 * @param posTags part-of-speech tag for each token (same length as tokens)
 * @param neTags  named-entity tag for each token (same length as tokens)
 * @return one TratzParsedTokenWritable per input token
 * @throws IllegalArgumentException if the three lists differ in length
 */
public static List<TratzParsedTokenWritable> getTaggedTokens(List<Token> tokens, List<String> posTags, List<String> neTags) {
    // Each token must be paired with exactly one POS tag and one NE tag.
    final int numTokens = tokens.size();
    if (numTokens != posTags.size() || posTags.size() != neTags.size()) {
        throw new IllegalArgumentException("Arguments must satisfy the condition: tokens.size() == posTags.size() == neTags.size()");
    }
    final List<TratzParsedTokenWritable> taggedTokens = new ArrayList<TratzParsedTokenWritable>();
    for (int index = 0; index < numTokens; index++) {
        // Bundle the surface form with its POS and NE tags into one writable.
        final TratzParsedTokenWritable tagged = new TratzParsedTokenWritable();
        tagged.setToken(tokens.get(index).getText());
        tagged.setPosTag(posTags.get(index));
        tagged.setNETag(neTags.get(index));
        taggedTokens.add(tagged);
    }
    return taggedTokens;
}
Use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd: class NLProcTools, method getPOSSignature.
/**
 * Maps the tokens in positions [chunkStart, chunkEnd) to a compact
 * part-of-speech signature, one character per token:
 *   N = noun-like (LOCATION/PERSON/ORGANIZATION entity, NN*, VBG, POS, CD, PR*, WP*)
 *   A = adjective (JJ*)
 *   P = preposition (IN*, TO*)
 *   C = coordination (the literal token "," or "and")
 *   X = POS tag whose first character is not an uppercase letter (e.g. punctuation tags)
 *   D = determiner (DT) -- note this is tested AFTER the X check, preserving
 *       the original precedence
 *   O = anything else
 *
 * @param tokens     full token list for the sentence
 * @param chunkStart index of the first token of the chunk (inclusive)
 * @param chunkEnd   index one past the last token of the chunk (exclusive)
 * @return the signature string, one character per token in the range
 */
private static String getPOSSignature(List<TratzParsedTokenWritable> tokens, int chunkStart, int chunkEnd) {
    // StringBuilder replaces the original String += in a loop, which is
    // accidentally O(n^2); presize to the exact result length.
    StringBuilder signature = new StringBuilder(Math.max(0, chunkEnd - chunkStart));
    for (int i = chunkStart; i < chunkEnd; i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        String term = t.getToken().toString();
        String posTag = t.getPosTag().toString();
        String neTag = t.getNETag().toString();
        // NOTE(review): an empty posTag would make charAt(0) throw, exactly as
        // in the original; presumably the tagger never emits an empty tag.
        if (neTag.equals("LOCATION") || neTag.equals("PERSON") || neTag.equals("ORGANIZATION")) {
            signature.append('N');
        } else if (posTag.startsWith("NN") || posTag.equals("VBG") || posTag.equals("POS") || posTag.equals("CD") || posTag.startsWith("PR") || posTag.startsWith("WP")) {
            signature.append('N');
        } else if (posTag.startsWith("JJ")) {
            signature.append('A');
        } else if (posTag.startsWith("IN") || posTag.startsWith("TO")) {
            signature.append('P');
        } else if (term.equals(",") || term.equals("and")) {
            signature.append('C');
        } else if (posTag.charAt(0) < 'A' || posTag.charAt(0) > 'Z') {
            signature.append('X');
        } else if (posTag.equals("DT")) {
            signature.append('D');
        } else {
            signature.append('O');
        }
    }
    return signature.toString();
}
Use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd: class NLProcTools, method extractMainChunk.
/**
 * Concatenates the non-punctuation tokens in positions [start, end) into a
 * single space-separated chunk string.
 *
 * @param start        index of the first token (inclusive)
 * @param end          index one past the last token (exclusive)
 * @param tokens       full token list for the sentence
 * @param appendPOSTag if true, appends "|" followed by the chunk's tag
 *                     (NOTE(review): despite the parameter name, the appended
 *                     tag is the last non-"O" NAMED-ENTITY tag seen in the
 *                     range, defaulting to "O" -- confirm intent)
 * @return the chunk as a Text, or null if every token in the range is punctuation
 */
private static Text extractMainChunk(int start, int end, List<TratzParsedTokenWritable> tokens, boolean appendPOSTag) {
    // StringBuilder replaces the original StringBuffer: the buffer is
    // method-local, so StringBuffer's synchronization is pure overhead.
    StringBuilder chunk = new StringBuilder();
    String tag = "O";
    boolean firstWord = true;
    for (int i = start; i < end; i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        String text = t.getToken().toString();
        String neTag = t.getNETag().toString();
        // Skip punctuation tokens everywhere in the range (the original
        // comment claimed only beginning/end, but the code skips all of them).
        if (isPunctuation(text)) {
            continue;
        }
        // Remember the most recent non-"O" NE tag as the chunk's approximate type.
        if (!"O".equals(neTag)) {
            tag = neTag;
        }
        // Separate words with a single space (no leading space).
        if (firstWord) {
            firstWord = false;
        } else {
            chunk.append(' ');
        }
        chunk.append(text);
    }
    // Return null if no valid terms were found.
    if (chunk.length() == 0) {
        return null;
    }
    if (appendPOSTag) {
        chunk.append('|').append(tag);
    }
    return new Text(chunk.toString());
}
Use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd: class DIRTExtractor, method getContext.
/**
 * Builds DIRT-style (context, pattern) pairs from two dependency-tree paths
 * that meet at a common root. The pattern is a pipe-delimited byte string:
 * left chunk type | left path | root | right path | right chunk type; the
 * context pairs the left and right chunk texts (or a wildcard on either side
 * when mOrContextStyle is set).
 *
 * NOTE(review): path entries appear to be 1-based token indices (every lookup
 * subtracts 1) -- confirm against the parser that fills leftPath/rightPath.
 *
 * @param leftPath  1-based token indices from the left slot up to the root
 * @param rightPath 1-based token indices from the right slot up to the root
 * @param tokens    parsed tokens of the sentence
 * @param chunks    per-position chunk text; positions in the same chunk share
 *                  the same Text instance
 * @return the extracted pairs; empty if either path is empty or a path would
 *         split a chunk
 * @throws RuntimeException if the two paths do not end at the same root
 */
private List<ContextPatternWritable> getContext(ArrayListOfInts leftPath, ArrayListOfInts rightPath, List<TratzParsedTokenWritable> tokens, Text[] chunks) {
//, int leftContextSize, int rightContextSize) {
// construct (context, pattern) pairs
List<ContextPatternWritable> contexts = new ArrayList<ContextPatternWritable>();
// make sure that the dimensions are feasible
if (leftPath.size() < 1 || rightPath.size() < 1) {
return contexts;
}
// make sure we don't split the left context's chunk
// (reject the pair if any interior path token lies in the same chunk as the
// left slot -- chunk identity is by Text.equals)
Text leftChunk = chunks[leftPath.get(0) - 1];
for (int i = 1; i <= leftPath.size() - 1; i++) {
if (chunks[leftPath.get(i) - 1].equals(leftChunk)) {
return contexts;
}
}
// make sure we don't split the right context's chunk
Text rightChunk = chunks[rightPath.get(0) - 1];
for (int i = rightPath.size() - 1; i >= 1; i--) {
if (chunks[rightPath.get(i) - 1].equals(rightChunk)) {
return contexts;
}
}
TratzParsedTokenWritable t;
Text term, posTag, dependType;
// construct pattern based on the parse tree path
final Text pattern = new Text();
// encode left context chunk type
// TODO: replace this with a more robust way of checking if this is an actual named entity or not
Text leftNETag = tokens.get(leftPath.get(0) - 1).getNETag();
Text leftChunkTag = tokens.get(leftPath.get(0) - 1).getChunkTag();
// treat the token as a named entity when its NE tag is anything other than
// the single character "O"
if (leftNETag.getLength() != 1 || (leftNETag.getLength() > 0 && leftNETag.getBytes()[0] != 'O')) {
pattern.append(leftNETag.getBytes(), 0, leftNETag.getLength());
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
} else {
// otherwise use the chunk tag; skipping the first 2 bytes presumably drops
// a BIO prefix such as "B-" or "I-" -- TODO confirm tag format
if (leftChunkTag.getLength() > 2) {
pattern.append(leftChunkTag.getBytes(), 2, leftChunkTag.getLength() - 2);
} else {
pattern.append(leftChunkTag.getBytes(), 0, leftChunkTag.getLength());
}
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
}
// left path portion of pattern
if (!mHeadOnly) {
// walk the left path from slot toward root (the root itself is handled below);
// interior nodes contribute term|pos| before their dependency type
for (int i = 0; i <= leftPath.size() - 2; i++) {
t = tokens.get(leftPath.get(i) - 1);
// mUseLemmas selects lemmatized forms instead of surface tokens
term = mUseLemmas ? t.getLemma() : t.getToken();
dependType = t.getDependType();
posTag = t.getPosTag();
if (i != 0) {
pattern.append(term.getBytes(), 0, term.getLength());
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
// POS tags are truncated to at most 2 bytes (e.g. "NNS" -> "NN")
pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
}
pattern.append(dependType.getBytes(), 0, dependType.getLength());
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
}
} else {
// head-only mode: only the slot token's dependency relation is encoded
dependType = tokens.get(leftPath.get(0) - 1).getDependType();
posTag = tokens.get(leftPath.get(0) - 1).getPosTag();
pattern.append(dependType.getBytes(), 0, dependType.getLength());
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
}
// root portion of pattern
// both paths must terminate at the same root token
if (leftPath.get(leftPath.size() - 1) != rightPath.get(rightPath.size() - 1)) {
throw new RuntimeException("Left and right paths do not share the same root! -- " + leftPath + " <--> " + rightPath);
}
t = tokens.get(leftPath.get(leftPath.size() - 1) - 1);
term = mUseLemmas ? t.getLemma() : t.getToken();
dependType = t.getDependType();
posTag = t.getPosTag();
// root is encoded as pos|term|pos (POS truncated to 2 bytes on both sides)
pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
pattern.append(term.getBytes(), 0, term.getLength());
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
// right path portion of pattern
if (!mHeadOnly) {
// walk the right path from root toward slot, mirroring the left side
for (int i = rightPath.size() - 2; i >= 0; i--) {
t = tokens.get(rightPath.get(i) - 1);
term = mUseLemmas ? t.getLemma() : t.getToken();
dependType = t.getDependType();
posTag = t.getPosTag();
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
pattern.append(dependType.getBytes(), 0, dependType.getLength());
if (i != 0) {
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
pattern.append(term.getBytes(), 0, term.getLength());
}
}
} else {
dependType = tokens.get(rightPath.get(0) - 1).getDependType();
posTag = tokens.get(rightPath.get(0) - 1).getPosTag();
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
pattern.append(dependType.getBytes(), 0, dependType.getLength());
}
// encode right context chunk type
// TODO: replace this with a more robust way of checking if this is an actual named entity or not
Text rightNETag = tokens.get(rightPath.get(0) - 1).getNETag();
Text rightChunkTag = tokens.get(rightPath.get(0) - 1).getChunkTag();
// same NE-vs-chunk-tag logic as the left side, mirrored
if (rightNETag.getLength() != 1 || (rightNETag.getLength() > 0 && rightNETag.getBytes()[0] != 'O')) {
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
pattern.append(rightNETag.getBytes(), 0, rightNETag.getLength());
} else {
pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
if (rightChunkTag.getLength() > 2) {
pattern.append(rightChunkTag.getBytes(), 2, rightChunkTag.getLength() - 2);
} else {
pattern.append(rightChunkTag.getBytes(), 0, rightChunkTag.getLength());
}
}
// emit the context(s): in "or" style, left and right slots are emitted
// separately with a wildcard on the other side; otherwise one pair with both
if (mOrContextStyle) {
if (!mRightOnlyContextStyle) {
ContextPatternWritable c = new ContextPatternWritable();
c.setContext(MavunoUtils.createContext(leftChunk, MavunoUtils.ASTERISK));
c.setPattern(pattern);
contexts.add(c);
}
if (!mLeftOnlyContextStyle) {
ContextPatternWritable c = new ContextPatternWritable();
c.setContext(MavunoUtils.createContext(MavunoUtils.ASTERISK, rightChunk));
c.setPattern(pattern);
contexts.add(c);
}
} else {
ContextPatternWritable c = new ContextPatternWritable();
c.setContext(MavunoUtils.createContext(leftChunk, rightChunk));
c.setPattern(pattern);
contexts.add(c);
}
return contexts;
}
Use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd: class DIRTExtractor, method loadDependPairs.
/**
 * Consumes the next sentence from the sentence iterator and refills
 * mDependPairs with the (context, pattern) pairs extracted from its
 * dependency tree, then resets mDependPairsIter over the new contents.
 */
private void loadDependPairs() {
    // Start from a clean slate of dependency pairs.
    mDependPairs.clear();
    // Pull the next parsed sentence and its tokens.
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();
    List<TratzParsedTokenWritable> tokens = sentence.getTokens();
    int numTokens = tokens.size();
    // Chunk id assigned to each token position.
    int[] chunkIds = NLProcTools.getChunkIds(tokens);
    // For each position, build the full text of the chunk covering it; all
    // positions within one chunk share the same Text instance, which is
    // grown in place as the chunk's tokens are appended.
    Text[] chunks = new Text[numTokens];
    Text activeChunk = null;
    for (int pos = 0; pos < numTokens; pos++) {
        Text tokenText = tokens.get(pos).getToken();
        boolean startsNewChunk = (pos == 0) || (chunkIds[pos] != chunkIds[pos - 1]);
        if (startsNewChunk) {
            activeChunk = new Text(tokenText);
        } else {
            activeChunk.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
            activeChunk.append(tokenText.getBytes(), 0, tokenText.getLength());
        }
        chunks[pos] = activeChunk;
    }
    // Adjacency lists for the dependency tree; slot 0 collects the roots,
    // slots 1..numTokens hold each token's children (1-based indices).
    ArrayListOfInts[] children = new ArrayListOfInts[numTokens + 1];
    for (int node = 0; node <= numTokens; node++) {
        children[node] = new ArrayListOfInts();
    }
    for (int pos = 0; pos < numTokens; pos++) {
        TratzParsedTokenWritable token = tokens.get(pos);
        // Punctuation edges are excluded from the tree.
        if (!token.getDependType().equals(PUNCTUATION_TYPE)) {
            children[token.getDependIndex()].add(pos + 1);
        }
    }
    // Walk each root's subtree, collecting (context, pattern) pairs.
    for (int r = 0; r < children[0].size(); r++) {
        extractPairs(children, children[0].get(r), tokens, chunks);
    }
    // Expose the freshly extracted pairs through the iterator.
    mDependPairsIter = mDependPairs.iterator();
}
Aggregations