Search in sources :

Example 1 with TratzParsedTokenWritable

Use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd.

From the class NLProcTools, method getTaggedTokens.

/**
 * Combines parallel lists of tokens, part-of-speech tags, and named-entity tags
 * into a single list of tagged tokens.
 *
 * @param tokens  source tokens; only each token's text is copied
 * @param posTags part-of-speech tag for the token at the same index
 * @param neTags  named-entity tag for the token at the same index
 * @return a new list with one {@code TratzParsedTokenWritable} per input token
 * @throws IllegalArgumentException if the three lists differ in size
 */
public static List<TratzParsedTokenWritable> getTaggedTokens(List<Token> tokens, List<String> posTags, List<String> neTags) {
    // make sure that # tokens = # pos tags = # ne tags
    if (tokens.size() != posTags.size() || posTags.size() != neTags.size()) {
        throw new IllegalArgumentException("Arguments must satisfy the condition: tokens.size() == posTags.size() == neTags.size()");
    }
    // final size is known up front, so presize to avoid intermediate resizes
    List<TratzParsedTokenWritable> taggedTokens = new ArrayList<TratzParsedTokenWritable>(tokens.size());
    for (int i = 0; i < tokens.size(); i++) {
        // construct tagged token from the i-th entry of each parallel list
        TratzParsedTokenWritable t = new TratzParsedTokenWritable();
        t.setToken(tokens.get(i).getText());
        t.setPosTag(posTags.get(i));
        t.setNETag(neTags.get(i));
        taggedTokens.add(t);
    }
    return taggedTokens;
}
Also used : TratzParsedTokenWritable(edu.isi.mavuno.util.TratzParsedTokenWritable) ArrayList(java.util.ArrayList)

Example 2 with TratzParsedTokenWritable

use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd.

From the class NLProcTools, method getPOSSignature.

/**
 * Maps the tokens in the half-open range [chunkStart, chunkEnd) to a compact
 * per-token signature string: N = noun-like (or LOCATION/PERSON/ORGANIZATION
 * named entity), A = adjective, P = preposition, C = coordination (comma or
 * "and"), X = non-alphabetic POS tag, D = determiner, O = other.
 *
 * @param tokens     tagged tokens for the sentence
 * @param chunkStart index of the first token in the chunk (inclusive)
 * @param chunkEnd   index one past the last token in the chunk (exclusive)
 * @return one signature character per token in the range
 */
private static String getPOSSignature(List<TratzParsedTokenWritable> tokens, int chunkStart, int chunkEnd) {
    // StringBuilder avoids O(n^2) behavior of repeated String concatenation
    StringBuilder signature = new StringBuilder(chunkEnd - chunkStart);
    for (int i = chunkStart; i < chunkEnd; i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        String term = t.getToken().toString();
        String posTag = t.getPosTag().toString();
        String neTag = t.getNETag().toString();
        // NOTE: branch order matters -- e.g. the 'X' test precedes the 'DT'
        // test, so only alphabetic-tagged determiners ever reach 'D'
        if (neTag.equals("LOCATION") || neTag.equals("PERSON") || neTag.equals("ORGANIZATION")) {
            signature.append('N');
        } else if (posTag.startsWith("NN") || posTag.equals("VBG") || posTag.equals("POS") || posTag.equals("CD") || posTag.startsWith("PR") || posTag.startsWith("WP")) {
            signature.append('N');
        } else if (posTag.startsWith("JJ")) {
            signature.append('A');
        } else if (posTag.startsWith("IN") || posTag.startsWith("TO")) {
            signature.append('P');
        } else if (term.equals(",") || term.equals("and")) {
            signature.append('C');
        } else if (posTag.charAt(0) < 'A' || posTag.charAt(0) > 'Z') {
            signature.append('X');
        } else if (posTag.equals("DT")) {
            signature.append('D');
        } else {
            signature.append('O');
        }
    }
    return signature.toString();
}
Also used : TratzParsedTokenWritable(edu.isi.mavuno.util.TratzParsedTokenWritable)

Example 3 with TratzParsedTokenWritable

use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd.

From the class NLProcTools, method extractMainChunk.

/**
 * Builds a space-separated string of the non-punctuation token texts in the
 * half-open range [start, end), optionally suffixed with "|" plus the last
 * non-"O" named-entity tag seen in the range ("O" if none was seen).
 *
 * @param tokens       tagged tokens for the sentence
 * @param start        index of the first token (inclusive)
 * @param end          index one past the last token (exclusive)
 * @param appendPOSTag if true, append "|" and the (approximate) NE tag
 * @return the chunk text, or {@code null} if every token was punctuation
 */
private static Text extractMainChunk(int start, int end, List<TratzParsedTokenWritable> tokens, boolean appendPOSTag) {
    // StringBuilder: no synchronization needed for a method-local buffer
    StringBuilder chunk = new StringBuilder();
    String tag = "O";
    boolean firstWord = true;
    for (int i = start; i < end; i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        String text = t.getToken().toString();
        String neTag = t.getNETag().toString();
        // exclude special characters and punctuation from the beginning and end of the chunk
        if (isPunctuation(text)) {
            continue;
        }
        // get (approximate) NE type -- the last non-"O" tag in the range wins
        if (!"O".equals(neTag)) {
            tag = neTag;
        }
        if (firstWord) {
            firstWord = false;
        } else {
            chunk.append(' ');
        }
        chunk.append(text);
    }
    // return null if no valid terms are found
    if (chunk.length() == 0) {
        return null;
    }
    if (appendPOSTag) {
        chunk.append('|');
        chunk.append(tag);
    }
    return new Text(chunk.toString());
}
Also used : TratzParsedTokenWritable(edu.isi.mavuno.util.TratzParsedTokenWritable) Text(org.apache.hadoop.io.Text)

Example 4 with TratzParsedTokenWritable

use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd.

From the class DIRTExtractor, method getContext.

/**
 * Builds DIRT-style (context, pattern) pairs from two dependency-tree paths
 * that share a common root. The pattern is a pipe-delimited byte sequence:
 * left chunk/NE type, the dependency/POS/term encoding of the left path, the
 * root term, the right path, and the right chunk/NE type. Contexts pair the
 * left and right chunk texts, with "*" wildcards when mOrContextStyle is set.
 *
 * NOTE(review): path entries appear to be 1-based token indices (every lookup
 * subtracts 1) with index 0 of each path being the end-point token nearest the
 * context chunk -- confirm against the caller that builds the paths.
 *
 * @param leftPath  token indices from the left context up to the shared root
 * @param rightPath token indices from the right context up to the shared root
 * @param tokens    parsed tokens of the sentence
 * @param chunks    per-token chunk text; tokens in the same chunk share the
 *                  same Text instance (identity is what the split checks use)
 * @return the extracted pairs; empty if a path is empty or a chunk would split
 * @throws RuntimeException if the two paths do not end at the same root token
 */
private List<ContextPatternWritable> getContext(ArrayListOfInts leftPath, ArrayListOfInts rightPath, List<TratzParsedTokenWritable> tokens, Text[] chunks) {
    //, int leftContextSize, int rightContextSize) {
    // construct (context, pattern) pairs
    List<ContextPatternWritable> contexts = new ArrayList<ContextPatternWritable>();
    // make sure that the dimensions are feasible
    if (leftPath.size() < 1 || rightPath.size() < 1) {
        return contexts;
    }
    // make sure we don't split the left context's chunk: no interior path
    // token may lie in the same chunk as the path's first (context) token
    Text leftChunk = chunks[leftPath.get(0) - 1];
    for (int i = 1; i <= leftPath.size() - 1; i++) {
        if (chunks[leftPath.get(i) - 1].equals(leftChunk)) {
            return contexts;
        }
    }
    // make sure we don't split the right context's chunk
    Text rightChunk = chunks[rightPath.get(0) - 1];
    for (int i = rightPath.size() - 1; i >= 1; i--) {
        if (chunks[rightPath.get(i) - 1].equals(rightChunk)) {
            return contexts;
        }
    }
    TratzParsedTokenWritable t;
    Text term, posTag, dependType;
    // construct pattern based on the parse tree path
    final Text pattern = new Text();
    // encode left context chunk type
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    Text leftNETag = tokens.get(leftPath.get(0) - 1).getNETag();
    Text leftChunkTag = tokens.get(leftPath.get(0) - 1).getChunkTag();
    // true unless the NE tag is exactly the single character "O" (= no entity)
    if (leftNETag.getLength() != 1 || (leftNETag.getLength() > 0 && leftNETag.getBytes()[0] != 'O')) {
        pattern.append(leftNETag.getBytes(), 0, leftNETag.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    } else {
        // strip a 2-byte prefix from the chunk tag when present -- presumably
        // a BIO-style "B-"/"I-" marker; TODO confirm tag format
        if (leftChunkTag.getLength() > 2) {
            pattern.append(leftChunkTag.getBytes(), 2, leftChunkTag.getLength() - 2);
        } else {
            pattern.append(leftChunkTag.getBytes(), 0, leftChunkTag.getLength());
        }
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    }
    // left path portion of pattern: walk from the context token toward (but
    // excluding) the root; the first hop contributes only its dependency type
    if (!mHeadOnly) {
        for (int i = 0; i <= leftPath.size() - 2; i++) {
            t = tokens.get(leftPath.get(i) - 1);
            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();
            if (i != 0) {
                pattern.append(term.getBytes(), 0, term.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                // POS tags are truncated to their first 2 bytes (coarse tag)
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            }
            pattern.append(dependType.getBytes(), 0, dependType.getLength());
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }
    } else {
        // head-only mode: encode just the context token's dependency type
        dependType = tokens.get(leftPath.get(0) - 1).getDependType();
        posTag = tokens.get(leftPath.get(0) - 1).getPosTag();
        pattern.append(dependType.getBytes(), 0, dependType.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    }
    // root portion of pattern: both paths must terminate at the same token
    if (leftPath.get(leftPath.size() - 1) != rightPath.get(rightPath.size() - 1)) {
        throw new RuntimeException("Left and right paths do not share the same root! -- " + leftPath + " <--> " + rightPath);
    }
    t = tokens.get(leftPath.get(leftPath.size() - 1) - 1);
    term = mUseLemmas ? t.getLemma() : t.getToken();
    dependType = t.getDependType();
    posTag = t.getPosTag();
    // root is encoded as coarsePOS|term|coarsePOS
    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(term.getBytes(), 0, term.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
    // right path portion of pattern: mirror of the left side, walked from the
    // root back out toward the right context token
    if (!mHeadOnly) {
        for (int i = rightPath.size() - 2; i >= 0; i--) {
            t = tokens.get(rightPath.get(i) - 1);
            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            pattern.append(dependType.getBytes(), 0, dependType.getLength());
            if (i != 0) {
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(term.getBytes(), 0, term.getLength());
            }
        }
    } else {
        dependType = tokens.get(rightPath.get(0) - 1).getDependType();
        posTag = tokens.get(rightPath.get(0) - 1).getPosTag();
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(dependType.getBytes(), 0, dependType.getLength());
    }
    // encode right context chunk type (same NE-vs-chunk-tag rule as the left)
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    Text rightNETag = tokens.get(rightPath.get(0) - 1).getNETag();
    Text rightChunkTag = tokens.get(rightPath.get(0) - 1).getChunkTag();
    if (rightNETag.getLength() != 1 || (rightNETag.getLength() > 0 && rightNETag.getBytes()[0] != 'O')) {
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(rightNETag.getBytes(), 0, rightNETag.getLength());
    } else {
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        if (rightChunkTag.getLength() > 2) {
            pattern.append(rightChunkTag.getBytes(), 2, rightChunkTag.getLength() - 2);
        } else {
            pattern.append(rightChunkTag.getBytes(), 0, rightChunkTag.getLength());
        }
    }
    // emit one pair per requested context style: left-only, right-only, or both
    if (mOrContextStyle) {
        if (!mRightOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(leftChunk, MavunoUtils.ASTERISK));
            c.setPattern(pattern);
            contexts.add(c);
        }
        if (!mLeftOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(MavunoUtils.ASTERISK, rightChunk));
            c.setPattern(pattern);
            contexts.add(c);
        }
    } else {
        ContextPatternWritable c = new ContextPatternWritable();
        c.setContext(MavunoUtils.createContext(leftChunk, rightChunk));
        c.setPattern(pattern);
        contexts.add(c);
    }
    return contexts;
}
Also used : TratzParsedTokenWritable(edu.isi.mavuno.util.TratzParsedTokenWritable) ContextPatternWritable(edu.isi.mavuno.util.ContextPatternWritable) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text)

Example 5 with TratzParsedTokenWritable

use of edu.isi.mavuno.util.TratzParsedTokenWritable in project mavuno by metzlerd.

From the class DIRTExtractor, method loadDependPairs.

/**
 * Consumes the next sentence from mSentIter and repopulates mDependPairs with
 * the (context, pattern) pairs extracted from its dependency parse, then
 * resets mDependPairsIter to iterate over them.
 */
private void loadDependPairs() {
    // clear dependency pairs
    mDependPairs.clear();
    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();
    // get sentence tokens
    List<TratzParsedTokenWritable> tokens = sentence.getTokens();
    // get chunk ids
    int[] chunkIds = NLProcTools.getChunkIds(tokens);
    // get mapping from positions to chunks
    Text[] chunks = new Text[tokens.size()];
    Text curChunk = null;
    for (int i = 0; i < tokens.size(); i++) {
        Text text = tokens.get(i).getToken();
        if (i == 0 || (i > 0 && chunkIds[i] != chunkIds[i - 1])) {
            // new chunk id -> start a fresh chunk Text
            curChunk = new Text(text);
        } else {
            // same chunk -> grow the shared Text in place; note that every
            // earlier chunks[j] for this chunk aliases this same instance,
            // so they all end up holding the full chunk string
            curChunk.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
            curChunk.append(text.getBytes(), 0, text.getLength());
        }
        chunks[i] = curChunk;
    }
    // populate parse tree: children[h] lists the 1-based indices of tokens
    // whose dependency head is h (slot 0 collects the root's children)
    ArrayListOfInts[] children = new ArrayListOfInts[tokens.size() + 1];
    for (int i = 0; i < tokens.size() + 1; i++) {
        children[i] = new ArrayListOfInts();
    }
    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        // ignore punctuation
        if (!t.getDependType().equals(PUNCTUATION_TYPE)) {
            children[t.getDependIndex()].add(i + 1);
        }
    }
    // extract (context, pattern) pairs from parse tree, starting one
    // traversal per root-attached token
    for (int i = 0; i < children[0].size(); i++) {
        extractPairs(children, children[0].get(i), tokens, chunks);
    }
    // get iterator
    mDependPairsIter = mDependPairs.iterator();
}
Also used : TratzParsedTokenWritable(edu.isi.mavuno.util.TratzParsedTokenWritable) ArrayListOfInts(edu.umd.cloud9.util.array.ArrayListOfInts) Text(org.apache.hadoop.io.Text)

Aggregations

TratzParsedTokenWritable (edu.isi.mavuno.util.TratzParsedTokenWritable)7 Text (org.apache.hadoop.io.Text)5 ContextPatternWritable (edu.isi.mavuno.util.ContextPatternWritable)2 ArrayList (java.util.ArrayList)2 ArrayListOfInts (edu.umd.cloud9.util.array.ArrayListOfInts)1