
Example 6 with TratzParsedTokenWritable

Use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd.

From the class NAryChunkExtractor, method loadChunkPairs:

private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();
    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();
    // get chunk ids
    List<TratzParsedTokenWritable> sentenceTokens = sentence.getTokens();
    int[] chunkIds = NLProcTools.getChunkIds(sentenceTokens);
    mChunks.clear();
    mChunkTokens.clear();
    // extract chunks from sentence
    for (int i = 0; i < chunkIds.length; i++) {
        if (i > 0 && chunkIds[i] != chunkIds[i - 1]) {
            Chunk chunk = createChunk(mChunkTokens);
            mChunks.add(chunk);
            mChunkTokens.clear();
        }
        mChunkTokens.add(sentenceTokens.get(i));
    }
    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) {
        Chunk chunk = createChunk(mChunkTokens);
        mChunks.add(chunk);
    }
    // there's nothing we can do if there aren't at least mArity chunks in the sentence
    if (mArity > mChunks.size()) {
        mChunkPairsIter = mChunkPairs.iterator();
        return;
    }
    // initialize context positions
    for (int i = 0; i < mArity; i++) {
        mContextPositions[i] = i;
    }
    // initialize pattern positions
    for (int i = 0; i < mArity - 1; i++) {
        mPatternPositions[i] = i;
    }
    // generate (context, pattern) pairs based on chunks
    final Text basePattern = new Text();
    while (true) {
        // construct context
        for (int i = 0; i < mArity; i++) {
            mContextChunks[i] = mChunks.get(mContextPositions[i]);
        }
        // construct pattern
        for (int i = 0; i < mArity - 1; i++) {
            mPatternChunks[i] = mChunks.get(mPatternPositions[i]);
        }
        // add (context, pattern) pair
        basePattern.clear();
        for (int i = 0; i < mArity - 1; i++) {
            // left chunk type
            basePattern.append(mContextChunks[i].type.getBytes(), 0, mContextChunks[i].type.getLength());
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
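            // wildcard(s) are needed when the single pattern chunk does not fill the entire gap between context chunks i and i+1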
            if (mContextPositions[i + 1] - mPatternPositions[i] > 1 || mPatternPositions[i] - mContextPositions[i] > 1) {
                if (mPatternPositions[i] == mContextPositions[i]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] == mContextPositions[i] + 1) {
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] + 1 == mContextPositions[i + 1]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                } else {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                }
            } else if (mPatternPositions[i] == mContextPositions[i]) {
                basePattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
            } else {
                basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
            }
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }
        // last chunk type
        basePattern.append(mContextChunks[mArity - 1].type.getBytes(), 0, mContextChunks[mArity - 1].type.getLength());
        basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        int[] indices;
        mPermGen.reset();
        while (mPermGen.hasMore()) {
            // get next permutation
            indices = mPermGen.getNext();
            ContextPatternWritable c = new ContextPatternWritable();
            // pattern
            c.setPattern(basePattern);
            Text numLeftText = new Text(mPermGen.getNumLeft() + "/" + mArity);
            c.getPattern().append(numLeftText.getBytes(), 0, numLeftText.getLength());
            // context
            c.getContext().clear();
            for (int i = 0; i < mArity; i++) {
                c.getContext().append(mContextChunks[indices[i]].text.getBytes(), 0, mContextChunks[indices[i]].text.getLength());
                if (i != mArity - 1) {
                    c.getContext().append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                }
            }
            // add to chunk pairs
            mChunkPairs.add(c);
        }
        // get next set of context and pattern positions
        int pos = mArity - 2;
        while (pos >= 0) {
            if (mPatternPositions[pos] + 1 < mChunks.size() && mPatternPositions[pos] + 1 < mContextPositions[pos + 1]) {
                mPatternPositions[pos]++;
                for (int i = pos + 1; i < mArity - 2; i++) {
                    mPatternPositions[i] = mContextPositions[i];
                }
                break;
            }
            pos--;
        }
        // update the context positions if the pattern positions can't be updated any further
        if (pos < 0) {
            pos = mArity - 1;
            while (pos >= 0) {
                if (mContextPositions[pos] + 1 < mChunks.size()
                        && (pos + 1 >= mArity || mContextPositions[pos + 1] - (mContextPositions[pos] + 1) >= 1)
                        && (pos <= 0 || mContextPositions[pos] - mContextPositions[pos - 1] - 1 <= mMaxSkipSize)) {
                    mContextPositions[pos]++;
                    if (pos < mArity - 1) {
                        mPatternPositions[pos] = mContextPositions[pos];
                    }
                    for (int i = pos + 1; i < mArity; i++) {
                        mContextPositions[i] = mContextPositions[pos] + (i - pos);
                        if (i < mArity - 1) {
                            mPatternPositions[i] = mContextPositions[i];
                        }
                    }
                    break;
                }
                pos--;
            }
            // if neither the context nor the pattern positions can be updated then we're done
            if (pos < 0) {
                // get iterator
                mChunkPairsIter = mChunkPairs.iterator();
                return;
            }
        }
    }
}
Also used: TratzParsedTokenWritable (edu.isi.mavuno.util.TratzParsedTokenWritable), ContextPatternWritable (edu.isi.mavuno.util.ContextPatternWritable), Text (org.apache.hadoop.io.Text)
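
The method above works in two phases: it first groups consecutive tokens that share a chunk id into chunks, and then enumerates n-ary combinations of those chunks as contexts, filling the gaps between context chunks with pattern chunks or asterisk wildcards. As a minimal sketch of the grouping phase only, the following self-contained class (hypothetical names, plain Java strings in place of the Hadoop and mavuno writables, and chunk ids assumed to be precomputed as NLProcTools.getChunkIds would produce them) mirrors the boundary test used above:

import java.util.ArrayList;
import java.util.List;

public class ChunkGroupingSketch {
    // Group consecutive tokens that share a chunk id, mirroring the boundary
    // test chunkIds[i] != chunkIds[i - 1] in loadChunkPairs above.
    static List<List<String>> groupByChunkId(List<String> tokens, int[] chunkIds) {
        List<List<String>> chunks = new ArrayList<>();
        List<String> current = new ArrayList<>();
        for (int i = 0; i < chunkIds.length; i++) {
            if (i > 0 && chunkIds[i] != chunkIds[i - 1]) {
                chunks.add(current);
                current = new ArrayList<>();
            }
            current.add(tokens.get(i));
        }
        if (!current.isEmpty()) {
            chunks.add(current); // handle the last chunk in the sentence
        }
        return chunks;
    }

    public static void main(String[] args) {
        List<String> tokens = List.of("The", "quick", "fox", "jumped", "over", "the", "dog");
        int[] chunkIds = { 1, 1, 1, 2, 3, 4, 4 };
        // prints [[The, quick, fox], [jumped], [over], [the, dog]]
        System.out.println(groupByChunkId(tokens, chunkIds));
    }
}

The real method additionally wraps each group of tokens in a Chunk via createChunk before the combination step.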

Example 7 with TratzParsedTokenWritable

Use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd.

From the class ChunkExtractor, method loadChunkPairs:

private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();
    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();
    // extract chunks from sentence
    mChunks.clear();
    mChunkTokens.clear();
    List<TratzParsedTokenWritable> tokens = sentence.getTokens();
    Text lastNETag = new Text();
    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        byte chunkType = t.getChunkTag().getLength() > 0 ? t.getChunkTag().getBytes()[0] : 0;
        Text neTag = t.getNETag();
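        // start a new chunk when the NE tag changes, or when the token is outside a named entity ('O') and its chunk tag is 'B' or 'O'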
        if (neTag.compareTo(lastNETag.getBytes(), 0, lastNETag.getLength()) != 0
                || (neTag.getLength() == 1 && (neTag.getLength() > 0 && neTag.getBytes()[0] == 'O'))
                        && (chunkType == 'B' || chunkType == 'O')) {
            if (mChunkTokens.size() > 0) {
                // && mChunkType.getBytes()[0] != 'O') {
                Text chunk = createChunk(mChunkTokens, mChunkType);
                mChunks.add(chunk);
            }
            mChunkTokens.clear();
            mChunkType.set(t.getChunkTag());
        }
        mChunkTokens.add(t.getToken());
        lastNETag.set(neTag);
    }
    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) {
        // && mChunkType.getBytes()[0] != 'O') {
        Text chunk = createChunk(mChunkTokens, mChunkType);
        mChunks.add(chunk);
    }
    // generate adjacent (context, pattern) pairs
    for (int patternPos = 0; patternPos < mChunks.size() - 1; patternPos++) {
        Text leftPattern = new Text();
        leftPattern.append(mChunks.get(patternPos).getBytes(), 0, mChunks.get(patternPos).getLength());
        leftPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        addPair(mChunks.get(patternPos), leftPattern, mChunks.get(patternPos + 1));
        Text rightPattern = new Text();
        rightPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        rightPattern.append(mChunks.get(patternPos + 1).getBytes(), 0, mChunks.get(patternPos + 1).getLength());
        addPair(mChunks.get(patternPos), rightPattern, mChunks.get(patternPos + 1));
    }
    // generate non-adjacent (context, pattern) pairs based on chunks
    for (int patternPos = 0; patternPos < mChunks.size(); patternPos++) {
        for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) {
            if (patternPos - leftSkip - 1 < 0) {
                continue;
            }
            if (mOrContextStyle && !mRightOnlyContextStyle) {
                addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos), ContextPatternWritable.ASTERISK);
            }
            if (mOrContextStyle && mLeftOnlyContextStyle) {
                continue;
            }
            for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) {
                if (patternPos + rightSkip + 1 >= mChunks.size()) {
                    continue;
                }
                // construct (context, pattern) pair
                if (mOrContextStyle) {
                    addPair(ContextPatternWritable.ASTERISK, mChunks.get(patternPos), mChunks.get(patternPos + rightSkip + 1));
                } else {
                    addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos), mChunks.get(patternPos + rightSkip + 1));
                }
            }
        }
    }
    // get iterator
    mChunkPairsIter = mChunkPairs.iterator();
}
Also used: TratzParsedTokenWritable (edu.isi.mavuno.util.TratzParsedTokenWritable), Text (org.apache.hadoop.io.Text)
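
Here the (context, pattern) pairs are built directly from the flat chunk list: each neighbouring pair of chunks yields two adjacency patterns (the left chunk followed by the adjacency marker, and the marker followed by the right chunk), and non-adjacent pairs are then generated for every skip distance up to mMaxSkipSize. The following self-contained sketch illustrates the adjacent-pair step only, under the assumption that chunks are plain Java strings and with a hypothetical "<ADJ>" marker standing in for ADJACENT_PATTERN_NAME:

import java.util.ArrayList;
import java.util.List;

public class AdjacentPairSketch {
    // Hypothetical stand-in for ADJACENT_PATTERN_NAME; the real marker text is defined in mavuno.
    static final String ADJ = "<ADJ>";

    // For each pair of neighbouring chunks, emit a left-anchored and a
    // right-anchored pattern, mirroring the adjacent-pair loop above.
    // Each result is {left context chunk, pattern, right context chunk}.
    static List<String[]> adjacentPairs(List<String> chunks) {
        List<String[]> pairs = new ArrayList<>();
        for (int pos = 0; pos < chunks.size() - 1; pos++) {
            String left = chunks.get(pos);
            String right = chunks.get(pos + 1);
            pairs.add(new String[] { left, left + ADJ, right });  // left chunk + marker
            pairs.add(new String[] { left, ADJ + right, right }); // marker + right chunk
        }
        return pairs;
    }

    public static void main(String[] args) {
        for (String[] p : adjacentPairs(List.of("Barack Obama", "visited", "Paris"))) {
            System.out.println(p[0] + " | " + p[1] + " | " + p[2]);
        }
    }
}

Each emitted triple corresponds to one addPair call in the method above; the non-adjacent and mOrContextStyle cases follow the same shape, substituting ContextPatternWritable.ASTERISK for one side of the context.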

Aggregations

TratzParsedTokenWritable (edu.isi.mavuno.util.TratzParsedTokenWritable): 7 uses
Text (org.apache.hadoop.io.Text): 5 uses
ContextPatternWritable (edu.isi.mavuno.util.ContextPatternWritable): 2 uses
ArrayList (java.util.ArrayList): 2 uses
ArrayListOfInts (edu.umd.cloud9.util.array.ArrayListOfInts): 1 use