Search in sources :

Example 1 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class FuzzyMatcher method buildSimilarityData.

/**
 * Computes one highlight marker per match token, used for colouring in the match window.
 * <p>
 * A match token that is not contained in {@code sourceTokens} is marked
 * {@code StringData.UNIQ}. A token that is contained but whose previous or next match
 * token is not contained is marked {@code StringData.PAIR}. Every other entry stays 0.
 * The first token is treated as having a contained predecessor and the last token as
 * having a contained successor.
 *
 * @param sourceTokens tokens to look the match tokens up in
 * @param matchTokens tokens to classify
 * @return an array of markers with the same length as {@code matchTokens}
 */
public static byte[] buildSimilarityData(Token[] sourceTokens, Token[] matchTokens) {
    final int count = matchTokens.length;
    // A freshly allocated byte array is zero-filled, so unmarked entries need no assignment.
    final byte[] markers = new byte[count];
    boolean previousInSource = true;
    for (int idx = 0; idx < count; idx++) {
        final boolean isLast = idx + 1 == count;
        final Token next = isLast ? null : matchTokens[idx + 1];
        final boolean nextInSource = isLast || DefaultTokenizer.isContains(sourceTokens, next);
        final boolean currentInSource = DefaultTokenizer.isContains(sourceTokens, matchTokens[idx]);
        if (!currentInSource) {
            markers[idx] = StringData.UNIQ;
        } else if (!(previousInSource && nextInSource)) {
            markers[idx] = StringData.PAIR;
        }
        previousInSource = currentInSource;
    }
    return markers;
}
Also used : Token(org.omegat.util.Token)

Example 2 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class LineLengthLimitWriter method breakAt.

/**
 * Writes the first {@code pos} characters of the buffered line (with trailing whitespace
 * stripped), removes them from the buffer, emits an end-of-line, and rebases the tokens.
 * <p>
 * Tokens that are {@code null} or start before {@code pos} become {@code null}; every other
 * token is replaced by a new token whose offset is reduced by {@code pos}.
 *
 * @param pos number of characters at the start of the buffer to write out
 * @param tokens tokens of the buffered line; updated in place
 * @throws IOException if writing to the underlying output fails
 */
void breakAt(int pos, Token[] tokens) throws IOException {
    // Trailing whitespace of the emitted part is discarded
    String head = str.substring(0, pos);
    out.write(StringUtil.rstrip(head));
    str.delete(0, pos);
    if (str.length() == 0) {
        writeSourceEol();
    } else {
        writeBreakEol();
    }
    for (int idx = 0; idx < tokens.length; idx++) {
        Token current = tokens[idx];
        boolean keep = current != null && current.getOffset() >= pos;
        tokens[idx] = keep ? new Token(null, current.getOffset() - pos, current.getLength()) : null;
    }
}
Also used : Token(org.omegat.util.Token)

Example 3 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class LineLengthLimitWriter method getBreakPos.

/**
 * Chooses the position in the buffered line {@code str} at which the line should be broken.
 * <p>
 * Lengths are compared in code points (via {@code codePointCount}), while the returned value
 * is a char offset into {@code str} taken from token offsets. The strategies are tried in
 * order and the first one that yields a position wins:
 * <ol>
 * <li>the whole buffer fits into {@code maxLineLength}: use the full line;</li>
 * <li>only space tokens lie beyond {@code maxLineLength}: use the full line;</li>
 * <li>break at the end of a run of space tokens that reaches {@code lineLength};</li>
 * <li>break at the start or end of a space token lying in
 * {@code [lineLength, maxLineLength)};</li>
 * <li>break at any token boundary in that range accepted by
 * {@code isPossibleBreakBefore};</li>
 * <li>break relative to the first token starting at or beyond {@code lineLength};</li>
 * <li>otherwise use the full line.</li>
 * </ol>
 * NOTE(review): {@code lineLength} and {@code maxLineLength} are fields declared outside this
 * view; they look like a preferred and a hard length limit - confirm against the class.
 *
 * @param tokens tokens of the buffered line; {@code null} entries are skipped
 * @return char offset in {@code str} to break at, or {@code str.length()} for the full line
 */
int getBreakPos(Token[] tokens) {
    if (str.codePointCount(0, str.length()) <= maxLineLength) {
        // line no longer than max length - use full line
        return str.length();
    }
    // check if spaces only more than max length:
    // scan from the end for the last token that is not spaces and remember where it ends
    int latestNonSpacesTokenPos = 0;
    for (int i = tokens.length - 1; i >= 0; i--) {
        Token t = tokens[i];
        if (t == null) {
            // less than begin
            continue;
        }
        if (isSpaces(t)) {
            continue;
        }
        // non-spaces token
        latestNonSpacesTokenPos = t.getOffset() + t.getLength();
        break;
    }
    // text up to the last non-spaces token fits, so only spaces exceed the limit
    if (str.codePointCount(0, latestNonSpacesTokenPos) <= maxLineLength) {
        return str.length();
    }
    // try to break on the space ends
    // spacesStart is the offset of the first token in the current run of space tokens, or -1
    int spacesStart = -1;
    for (Token t : tokens) {
        if (t == null) {
            // less than begin
            continue;
        }
        if (str.codePointCount(0, t.getOffset()) >= lineLength) {
            // spaces can be after max length
            // this check runs before t updates spacesStart, so the run consists of the
            // tokens preceding t; break at t when that run started before maxLineLength
            if (spacesStart >= 0 && str.codePointCount(0, spacesStart) < maxLineLength) {
                return t.getOffset();
            }
        }
        if (isSpaces(t)) {
            if (spacesStart < 0) {
                spacesStart = t.getOffset();
            }
        } else {
            // a non-spaces token ends the current run
            spacesStart = -1;
        }
    }
    // try to break on the space boundaries
    for (Token t : tokens) {
        if (t == null) {
            // less than begin
            continue;
        }
        // first try the start of the token ...
        int cps = str.codePointCount(0, t.getOffset());
        if (cps >= lineLength && cps < maxLineLength) {
            if (isSpaces(t)) {
                return t.getOffset();
            }
        }
        // ... then the end of the token
        cps = str.codePointCount(0, t.getOffset() + t.getLength());
        if (cps >= lineLength && cps < maxLineLength) {
            if (isSpaces(t)) {
                return t.getOffset() + t.getLength();
            }
        }
    }
    // impossible to break on space boundaries - break at any token, except brackets
    // (which positions are acceptable is decided by isPossibleBreakBefore)
    for (Token t : tokens) {
        if (t == null) {
            // less than begin
            continue;
        }
        // same start-then-end probing as above, for every kind of token
        int cps = str.codePointCount(0, t.getOffset());
        if (cps >= lineLength && cps < maxLineLength) {
            if (isPossibleBreakBefore(t.getOffset())) {
                return t.getOffset();
            }
        }
        cps = str.codePointCount(0, t.getOffset() + t.getLength());
        if (cps >= lineLength && cps < maxLineLength) {
            if (isPossibleBreakBefore(t.getOffset() + t.getLength())) {
                return t.getOffset() + t.getLength();
            }
        }
    }
    // use latest token before line length
    for (int i = 0; i < tokens.length; i++) {
        Token t = tokens[i];
        if (t == null) {
            // less than begin
            continue;
        }
        if (str.codePointCount(0, t.getOffset()) >= lineLength) {
            if (i == 0) {
                // no earlier array entry to fall back to - break after this token
                return t.getOffset() + t.getLength();
            }
            // walk backwards through the earlier entries looking for an acceptable break;
            // offset 0 is excluded, so an empty leading part is never chosen here
            int j = i - 1;
            while (j >= 0) {
                Token tp = tokens[j--];
                if (tp != null && tp.getOffset() > 0) {
                    if (isPossibleBreakBefore(tp.getOffset())) {
                        return tp.getOffset();
                    }
                }
            }
            // nothing acceptable earlier - break right before this token
            return t.getOffset();
        }
    }
    // use full line
    return str.length();
}
Also used : Token(org.omegat.util.Token)

Example 4 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class MatchesTextArea method setActiveMatch.

/**
 * Makes the fuzzy match with the given zero-based index the active one and restyles the
 * document: source-token highlights for the active match, diff highlights for all matches,
 * and the selection style over the active match's range.
 * <p>
 * Nothing happens when the index is out of range or already active. When the project has
 * no source tokenizer, the method returns after the index has been stored and the existing
 * styling has been cleared.
 */
@Override
public void setActiveMatch(int activeMatch) {
    UIThreadsUtil.mustBeSwingThread();
    boolean outOfRange = activeMatch < 0 || activeMatch >= matches.size();
    if (outOfRange || this.activeMatch == activeMatch) {
        return;
    }
    this.activeMatch = activeMatch;

    // Drop every style currently applied to the document
    StyledDocument doc = (StyledDocument) getDocument();
    doc.setCharacterAttributes(0, doc.getLength(), ATTRIBUTES_EMPTY, true);

    int start = delimiters.get(activeMatch);
    int end = delimiters.get(activeMatch + 1);
    NearString match = matches.get(activeMatch);
    ITokenizer tokenizer = Core.getProject().getSourceTokenizer();
    if (tokenizer == null) {
        return;
    }

    // Source text styling for the active match
    if (sourcePos.get(activeMatch) != -1) {
        Token[] tokens = tokenizer.tokenizeVerbatim(match.source);
        // fix for bug 1586397
        byte[] attributes = match.attr;
        int sourceBase = start + sourcePos.get(activeMatch);
        for (int idx = 0; idx < tokens.length; idx++) {
            Token token = tokens[idx];
            int tokenStart = sourceBase + token.getOffset();
            int tokenLength = token.getLength();
            byte flags = attributes[idx];
            if ((flags & StringData.UNIQ) != 0) {
                doc.setCharacterAttributes(tokenStart, tokenLength, ATTRIBUTES_CHANGED, false);
            } else if ((flags & StringData.PAIR) != 0) {
                doc.setCharacterAttributes(tokenStart, tokenLength, ATTRIBUTES_UNCHANGED, false);
            }
        }
    }

    // Diff styling for every displayed match
    for (int matchIdx = 0; matchIdx < diffInfos.size(); matchIdx++) {
        boolean isActive = matchIdx == activeMatch;
        Map<Integer, List<TextRun>> variants = diffInfos.get(matchIdx);
        // One entry per diff variant (${diff}, ${diffReversed}, ...)
        for (Entry<Integer, List<TextRun>> variant : variants.entrySet()) {
            int diffPos = variant.getKey();
            if (diffPos == -1) {
                continue;
            }
            // One run per styled chunk (added or deleted)
            for (TextRun run : variant.getValue()) {
                int runStart = delimiters.get(matchIdx) + diffPos + run.start;
                switch (run.type) {
                    case DELETE:
                        doc.setCharacterAttributes(runStart, run.length,
                                isActive ? ATTRIBUTES_DELETED_ACTIVE : ATTRIBUTES_DELETED_INACTIVE, false);
                        break;
                    case INSERT:
                        doc.setCharacterAttributes(runStart, run.length,
                                isActive ? ATTRIBUTES_INSERTED_ACTIVE : ATTRIBUTES_INSERTED_INACTIVE, false);
                        break;
                    case NOCHANGE:
                    default:
                        break;
                }
            }
        }
    }

    doc.setCharacterAttributes(start, end - start, ATTRIBUTES_SELECTED, false);
    // Caret goes to the end of the match first (minus the two newlines) ...
    setCaretPosition(end - 2);
    // ... and is moved to the start of the match in a later event-dispatch pass
    SwingUtilities.invokeLater(() -> setCaretPosition(start));
}
Also used : StyledDocument(javax.swing.text.StyledDocument) Token(org.omegat.util.Token) NearString(org.omegat.core.matching.NearString) TextRun(org.omegat.core.matching.DiffDriver.TextRun) Point(java.awt.Point) ITokenizer(org.omegat.tokenizer.ITokenizer) List(java.util.List) ArrayList(java.util.ArrayList)

Example 5 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class CalcMatchStatistics method calcMaxSimilarity.

/**
 * Returns the best similarity score found between the entry's source text and the
 * candidates returned by the thread's {@code FindMatches} instance.
 * <p>
 * Candidates are searched using the source text with XML tags removed, but scoring
 * tokenizes {@code ste.getSrcText()}. A candidate flagged as fuzzy has
 * {@code FindMatches.PENALTY_FOR_FUZZY} subtracted from its score. The scan stops early
 * once a score of 95 or more has been reached.
 *
 * @param ste the entry to find the best match for
 * @return the highest score seen, or 0 when no candidate scores above 0
 */
int calcMaxSimilarity(SourceTextEntry ste) {
    String searchText = removeXmlTags(ste);
    FindMatches matcher = finder.get();
    List<NearString> candidates = matcher.search(searchText, true, false, this::isInterrupted);
    final Token[] sourceTokens = matcher.tokenizeAll(ste.getSrcText());
    int best = 0;
    for (NearString candidate : candidates) {
        final Token[] candidateTokens = matcher.tokenizeAll(candidate.source);
        int score = FuzzyMatcher.calcSimilarity(distanceCalculator.get(), sourceTokens, candidateTokens);
        if (candidate.fuzzyMark) {
            score -= FindMatches.PENALTY_FOR_FUZZY;
        }
        if (score <= best) {
            continue;
        }
        best = score;
        if (best >= 95) {
            // enough to say that we are in row 2
            break;
        }
    }
    return best;
}
Also used : Token(org.omegat.util.Token) NearString(org.omegat.core.matching.NearString)

Aggregations

Token (org.omegat.util.Token)19 ArrayList (java.util.ArrayList)8 NearString (org.omegat.core.matching.NearString)3 List (java.util.List)2 ITokenizer (org.omegat.tokenizer.ITokenizer)2 Point (java.awt.Point)1 IOException (java.io.IOException)1 BreakIterator (java.text.BreakIterator)1 Arrays (java.util.Arrays)1 Collections (java.util.Collections)1 Comparator (java.util.Comparator)1 HashSet (java.util.HashSet)1 Matcher (java.util.regex.Matcher)1 JMenuItem (javax.swing.JMenuItem)1 HighlightPainter (javax.swing.text.Highlighter.HighlightPainter)1 StyledDocument (javax.swing.text.StyledDocument)1 TokenStream (org.apache.lucene.analysis.TokenStream)1 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)1 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)1 Test (org.junit.Test)1