use of edu.isi.mavuno.util.ContextPatternWritable in project mavuno by metzlerd.
the class ChunkExtractor method addPair.
/**
 * Records one chunk pair in both directions.
 *
 * <p>The forward entry uses the context built from (left, right) together with
 * {@code pattern}. The reverse entry uses the context built from (right, left);
 * its pattern is set from REVERSE_PATTERN and then has the bytes of
 * {@code pattern} appended to it.
 *
 * @param left    text of the left chunk
 * @param pattern pattern observed between the two chunks
 * @param right   text of the right chunk
 */
private void addPair(Text left, Text pattern, Text right) {
    // forward direction: (left, right) under the pattern as given
    final ContextPatternWritable forward = new ContextPatternWritable();
    forward.setContext(MavunoUtils.createContext(left, right));
    forward.setPattern(pattern);
    mChunkPairs.add(forward);

    // reverse direction: (right, left) under REVERSE_PATTERN followed by the pattern bytes
    final ContextPatternWritable reverse = new ContextPatternWritable();
    reverse.setContext(MavunoUtils.createContext(right, left));
    reverse.setPattern(REVERSE_PATTERN);
    final Text reversePattern = reverse.getPattern();
    reversePattern.append(pattern.getBytes(), 0, pattern.getLength());
    mChunkPairs.add(reverse);
}
use of edu.isi.mavuno.util.ContextPatternWritable in project mavuno by metzlerd.
the class CooccurExtractor method getNextPair.
/* (non-Javadoc)
 * @see edu.isi.mavuno.extract.Extractor#getNextPair(edu.isi.mavuno.util.ContextPatternWritable)
 *
 * Copies the context and pattern of the next buffered pair into the supplied
 * object. Returns false, leaving the argument untouched, when there is no
 * iterator or the iterator has no further elements.
 */
@Override
public boolean getNextPair(ContextPatternWritable pair) {
    // nothing to hand out if no iterator exists or it is exhausted
    if (mPairsIter == null || !mPairsIter.hasNext()) {
        return false;
    }

    final ContextPatternWritable next = mPairsIter.next();
    pair.setContext(next.getContext());
    pair.setPattern(next.getPattern());
    return true;
}
use of edu.isi.mavuno.util.ContextPatternWritable in project mavuno by metzlerd.
the class CooccurExtractor method loadPairs.
/**
 * Rebuilds mPairs from the given terms.
 *
 * <p>For every start position and every gram size from 1 to mMaxGramSize,
 * getPattern is asked to fill mPattern; whenever it reports success, a new
 * pair with context {@code id} and the current mPattern is appended to mPairs.
 *
 * @param terms the terms to build patterns from
 * @param id    value used as the context of every generated pair
 */
private void loadPairs(Text[] terms, String id) {
    mPairs.clear();

    for (int start = 0; start < terms.length; start++) {
        for (int gramSize = 1; gramSize <= mMaxGramSize; gramSize++) {
            // only emit a pair when a pattern could be produced for this (start, size)
            if (getPattern(mPattern, terms, start, gramSize)) {
                final ContextPatternWritable pair = new ContextPatternWritable();
                pair.setContext(id);
                pair.setPattern(mPattern);
                mPairs.add(pair);
            }
        }
    }
}
use of edu.isi.mavuno.util.ContextPatternWritable in project mavuno by metzlerd.
the class DIRTExtractor method getContext.
/**
 * Builds the (context, pattern) pairs for two parse tree paths that end at the same root token.
 *
 * <p>Path entries are 1-based positions: every lookup into {@code tokens} and {@code chunks}
 * subtracts one. Element 0 of each path is the context token and the last element is the root.
 * The pattern is a pipe-delimited byte string laid out as
 * {@code leftType | leftPath | rootPOS | rootTerm | rootPOS | rightPath | rightType}, where:
 * <ul>
 *   <li>a context "type" is the token's NE tag, unless that tag is exactly the single byte
 *       {@code 'O'}, in which case the chunk tag is used (without its first two bytes when it
 *       is longer than two bytes);</li>
 *   <li>POS tags are truncated to at most two bytes;</li>
 *   <li>terms are lemmas when mUseLemmas is set, surface tokens otherwise;</li>
 *   <li>when mHeadOnly is set, each path contributes only the dependency type of its context
 *       token instead of the full path.</li>
 * </ul>
 *
 * <p>With mOrContextStyle set, up to two pairs are produced, (leftChunk, *) and (*, rightChunk),
 * subject to mRightOnlyContextStyle / mLeftOnlyContextStyle; otherwise a single
 * (leftChunk, rightChunk) pair is produced.
 *
 * @param leftPath  1-based token positions from the left context token up to the root
 * @param rightPath 1-based token positions from the right context token up to the root
 * @param tokens    parsed tokens of the sentence
 * @param chunks    chunk text for each token position
 * @return the generated pairs; empty if either path is empty, or if a later element of a path
 *         lies in a chunk equal to that path's context chunk
 * @throws RuntimeException if the two paths do not end at the same token position
 */
private List<ContextPatternWritable> getContext(ArrayListOfInts leftPath, ArrayListOfInts rightPath, List<TratzParsedTokenWritable> tokens, Text[] chunks) {
    // construct (context, pattern) pairs
    final List<ContextPatternWritable> contexts = new ArrayList<ContextPatternWritable>();

    // make sure that the dimensions are feasible
    if (leftPath.size() < 1 || rightPath.size() < 1) {
        return contexts;
    }

    // make sure we don't split the left context's chunk
    final Text leftChunk = chunks[leftPath.get(0) - 1];
    for (int i = 1; i < leftPath.size(); i++) {
        if (chunks[leftPath.get(i) - 1].equals(leftChunk)) {
            return contexts;
        }
    }

    // make sure we don't split the right context's chunk
    final Text rightChunk = chunks[rightPath.get(0) - 1];
    for (int i = rightPath.size() - 1; i >= 1; i--) {
        if (chunks[rightPath.get(i) - 1].equals(rightChunk)) {
            return contexts;
        }
    }

    TratzParsedTokenWritable t;
    Text term, posTag, dependType;

    // construct pattern based on the parse tree path
    final Text pattern = new Text();

    // encode left context chunk type
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    final TratzParsedTokenWritable leftToken = tokens.get(leftPath.get(0) - 1);
    final Text leftNETag = leftToken.getNETag();
    final Text leftChunkTag = leftToken.getChunkTag();
    // any NE tag other than the single byte 'O' (including an empty tag) is used as the type
    if (leftNETag.getLength() != 1 || leftNETag.getBytes()[0] != 'O') {
        pattern.append(leftNETag.getBytes(), 0, leftNETag.getLength());
    } else if (leftChunkTag.getLength() > 2) {
        // skip the first two bytes of the chunk tag
        // NOTE(review): presumably a "B-"/"I-" style prefix -- confirm against the chunker output
        pattern.append(leftChunkTag.getBytes(), 2, leftChunkTag.getLength() - 2);
    } else {
        pattern.append(leftChunkTag.getBytes(), 0, leftChunkTag.getLength());
    }
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

    // left path portion of pattern (everything below the root, walking upwards)
    if (!mHeadOnly) {
        for (int i = 0; i < leftPath.size() - 1; i++) {
            t = tokens.get(leftPath.get(i) - 1);
            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();

            // the context token itself (i == 0) contributes only its dependency type
            if (i != 0) {
                pattern.append(term.getBytes(), 0, term.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            }

            pattern.append(dependType.getBytes(), 0, dependType.getLength());
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }
    } else {
        dependType = leftToken.getDependType();
        pattern.append(dependType.getBytes(), 0, dependType.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    }

    // root portion of pattern
    if (leftPath.get(leftPath.size() - 1) != rightPath.get(rightPath.size() - 1)) {
        throw new RuntimeException("Left and right paths do not share the same root! -- " + leftPath + " <--> " + rightPath);
    }

    t = tokens.get(leftPath.get(leftPath.size() - 1) - 1);
    term = mUseLemmas ? t.getLemma() : t.getToken();
    posTag = t.getPosTag();

    // the root is written as POS|term|POS
    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(term.getBytes(), 0, term.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());

    // right path portion of pattern (walking from just below the root down to the context token)
    if (!mHeadOnly) {
        for (int i = rightPath.size() - 2; i >= 0; i--) {
            t = tokens.get(rightPath.get(i) - 1);
            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();

            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            pattern.append(dependType.getBytes(), 0, dependType.getLength());

            // the context token itself (i == 0) contributes only its dependency type
            if (i != 0) {
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(term.getBytes(), 0, term.getLength());
            }
        }
    } else {
        dependType = tokens.get(rightPath.get(0) - 1).getDependType();
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(dependType.getBytes(), 0, dependType.getLength());
    }

    // encode right context chunk type
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    final TratzParsedTokenWritable rightToken = tokens.get(rightPath.get(0) - 1);
    final Text rightNETag = rightToken.getNETag();
    final Text rightChunkTag = rightToken.getChunkTag();
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    // same rule as on the left: any NE tag other than the single byte 'O' is used as the type
    if (rightNETag.getLength() != 1 || rightNETag.getBytes()[0] != 'O') {
        pattern.append(rightNETag.getBytes(), 0, rightNETag.getLength());
    } else if (rightChunkTag.getLength() > 2) {
        pattern.append(rightChunkTag.getBytes(), 2, rightChunkTag.getLength() - 2);
    } else {
        pattern.append(rightChunkTag.getBytes(), 0, rightChunkTag.getLength());
    }

    // emit the pairs according to the configured context style
    if (mOrContextStyle) {
        if (!mRightOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(leftChunk, MavunoUtils.ASTERISK));
            c.setPattern(pattern);
            contexts.add(c);
        }
        if (!mLeftOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(MavunoUtils.ASTERISK, rightChunk));
            c.setPattern(pattern);
            contexts.add(c);
        }
    } else {
        ContextPatternWritable c = new ContextPatternWritable();
        c.setContext(MavunoUtils.createContext(leftChunk, rightChunk));
        c.setPattern(pattern);
        contexts.add(c);
    }

    return contexts;
}
use of edu.isi.mavuno.util.ContextPatternWritable in project mavuno by metzlerd.
the class NAryChunkExtractor method loadChunkPairs.
/**
 * Consumes the next sentence from mSentIter and fills mChunkPairs with every n-ary
 * (context, pattern) pair that can be formed from its chunks, then points
 * mChunkPairsIter at the result.
 *
 * <p>Steps:
 * <ol>
 *   <li>Group the sentence tokens into chunks using the ids from NLProcTools.getChunkIds:
 *       a new chunk starts whenever the id differs from the previous token's id.</li>
 *   <li>If the sentence has fewer than mArity chunks, leave mChunkPairs empty and return.</li>
 *   <li>Otherwise enumerate tuples of context positions (mArity chunk indices) and, for each,
 *       tuples of pattern positions (mArity - 1 chunk indices, one per gap between consecutive
 *       contexts, each satisfying context[i] &lt;= pattern[i] &lt; context[i + 1]).</li>
 *   <li>For each combination build a base pattern and emit one pair per permutation produced
 *       by mPermGen; the pattern is suffixed with "&lt;numLeft&gt;/&lt;arity&gt;" and the context is the
 *       pipe-joined text of the context chunks in permuted order.</li>
 * </ol>
 *
 * <p>Callers must ensure mSentIter has a next element; this method does not check.
 */
private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // get chunk ids
    List<TratzParsedTokenWritable> sentenceTokens = sentence.getTokens();
    int[] chunkIds = NLProcTools.getChunkIds(sentenceTokens);

    mChunks.clear();
    mChunkTokens.clear();

    // extract chunks from sentence: flush the accumulated tokens each time the chunk id changes
    for (int i = 0; i < chunkIds.length; i++) {
        if (i > 0 && chunkIds[i] != chunkIds[i - 1]) {
            Chunk chunk = createChunk(mChunkTokens);
            mChunks.add(chunk);
            mChunkTokens.clear();
        }
        mChunkTokens.add(sentenceTokens.get(i));
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) {
        Chunk chunk = createChunk(mChunkTokens);
        mChunks.add(chunk);
    }

    // there's nothing we can do if there aren't at least mArity chunks in the sentence
    if (mArity > mChunks.size()) {
        mChunkPairsIter = mChunkPairs.iterator();
        return;
    }

    // initialize context positions
    for (int i = 0; i < mArity; i++) {
        mContextPositions[i] = i;
    }

    // initialize pattern positions
    for (int i = 0; i < mArity - 1; i++) {
        mPatternPositions[i] = i;
    }

    // generate (context, pattern) pairs based on chunks
    final Text basePattern = new Text();
    while (true) {
        // construct context
        for (int i = 0; i < mArity; i++) {
            mContextChunks[i] = mChunks.get(mContextPositions[i]);
        }

        // construct pattern
        for (int i = 0; i < mArity - 1; i++) {
            mPatternChunks[i] = mChunks.get(mPatternPositions[i]);
        }

        // add (context, pattern) pair
        basePattern.clear();
        for (int i = 0; i < mArity - 1; i++) {
            // left chunk type
            basePattern.append(mContextChunks[i].type.getBytes(), 0, mContextChunks[i].type.getLength());
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

            if (mContextPositions[i + 1] - mPatternPositions[i] > 1 || mPatternPositions[i] - mContextPositions[i] > 1) {
                // there is at least one skipped chunk on one side of the pattern chunk;
                // an asterisk marks each side where chunks are skipped
                if (mPatternPositions[i] == mContextPositions[i]) {
                    // no pattern chunk selected for this gap
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] == mContextPositions[i] + 1) {
                    // pattern chunk directly follows the left context
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] + 1 == mContextPositions[i + 1]) {
                    // pattern chunk directly precedes the right context
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                } else {
                    // pattern chunk has skipped chunks on both sides
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                }
            } else if (mPatternPositions[i] == mContextPositions[i]) {
                // the two contexts are neighbours and no pattern chunk is selected
                basePattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
            } else {
                // exactly one chunk between the contexts, used as the pattern
                basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
            }
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }

        // last chunk type
        basePattern.append(mContextChunks[mArity - 1].type.getBytes(), 0, mContextChunks[mArity - 1].type.getLength());
        basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

        int[] indices;
        mPermGen.reset();
        while (mPermGen.hasMore()) {
            // get next permutation
            indices = mPermGen.getNext();

            ContextPatternWritable c = new ContextPatternWritable();

            // pattern: base pattern followed by "<numLeft>/<arity>" for this permutation
            c.setPattern(basePattern);
            Text numLeftText = new Text(mPermGen.getNumLeft() + "/" + mArity);
            c.getPattern().append(numLeftText.getBytes(), 0, numLeftText.getLength());

            // context: pipe-joined context chunk texts in permuted order
            c.getContext().clear();
            for (int i = 0; i < mArity; i++) {
                c.getContext().append(mContextChunks[indices[i]].text.getBytes(), 0, mContextChunks[indices[i]].text.getLength());
                if (i != mArity - 1) {
                    c.getContext().append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                }
            }

            // add to chunk pairs
            mChunkPairs.add(c);
        }

        // get next set of context and pattern positions:
        // advance the right-most pattern position that still has room before the next context
        int pos = mArity - 2;
        while (pos >= 0) {
            if (mPatternPositions[pos] + 1 < mChunks.size() && mPatternPositions[pos] + 1 < mContextPositions[pos + 1]) {
                mPatternPositions[pos]++;
                // rewind every later pattern position to its context position; the bound must
                // include the last pattern index (mArity - 2), otherwise that position is never
                // rewound and combinations are skipped when mArity >= 3
                for (int i = pos + 1; i < mArity - 1; i++) {
                    mPatternPositions[i] = mContextPositions[i];
                }
                break;
            }
            pos--;
        }

        // update the context positions if the pattern positions can't be updated any further
        if (pos < 0) {
            pos = mArity - 1;
            while (pos >= 0) {
                // a context position may advance when it stays inside the sentence, stays strictly
                // left of the next context, and its current gap to the previous context is at most
                // mMaxSkipSize
                if (mContextPositions[pos] + 1 < mChunks.size() && (pos + 1 >= mArity || mContextPositions[pos + 1] - (mContextPositions[pos] + 1) >= 1) && (pos <= 0 || mContextPositions[pos] - mContextPositions[pos - 1] - 1 <= mMaxSkipSize)) {
                    mContextPositions[pos]++;
                    if (pos < mArity - 1) {
                        mPatternPositions[pos] = mContextPositions[pos];
                    }
                    // pack all later contexts immediately after the advanced one and
                    // restart their pattern positions
                    for (int i = pos + 1; i < mArity; i++) {
                        mContextPositions[i] = mContextPositions[pos] + (i - pos);
                        if (i < mArity - 1) {
                            mPatternPositions[i] = mContextPositions[i];
                        }
                    }
                    break;
                }
                pos--;
            }

            // if neither the context nor the pattern positions can be updated then we're done
            if (pos < 0) {
                // get iterator
                mChunkPairsIter = mChunkPairs.iterator();
                return;
            }
        }
    }
}
Aggregations