Search in sources :

Example 6 with NearString

use of org.omegat.core.matching.NearString in project omegat by omegat-org.

In the class CalcMatchStatistics, method calcMaxSimilarity.

/**
 * Finds the best similarity score among the near matches of the given entry.
 * <p>
 * Matches are searched using the entry's source text with XML tags removed, and every candidate is
 * scored against the tokens of the entry's full source text. Candidates flagged as fuzzy are reduced
 * by {@code FindMatches.PENALTY_FOR_FUZZY}. The scan stops at the first candidate scoring 95 or more.
 *
 * @param ste
 *            source entry to evaluate
 * @return the highest score seen, or 0 when no candidate scores above 0
 */
int calcMaxSimilarity(SourceTextEntry ste) {
    String tagFreeSource = removeXmlTags(ste);
    FindMatches matchFinder = finder.get();
    List<NearString> candidates = matchFinder.search(tagFreeSource, true, false, this::isInterrupted);
    final Token[] sourceTokens = matchFinder.tokenizeAll(ste.getSrcText());
    int best = 0;
    for (NearString candidate : candidates) {
        final Token[] candidateTokens = matchFinder.tokenizeAll(candidate.source);
        int score = FuzzyMatcher.calcSimilarity(distanceCalculator.get(), sourceTokens, candidateTokens);
        if (candidate.fuzzyMark) {
            score -= FindMatches.PENALTY_FOR_FUZZY;
        }
        if (score <= best) {
            // not an improvement, keep looking
            continue;
        }
        best = score;
        if (best >= 95) {
            // enough to say that we are in row 2, no need to look at the remaining candidates
            break;
        }
    }
    return best;
}
Also used : Token(org.omegat.util.Token) NearString(org.omegat.core.matching.NearString) NearString(org.omegat.core.matching.NearString)

Example 7 with NearString

use of org.omegat.core.matching.NearString in project omegat by omegat-org.

In the class FindMatches, method haveChanceToAdd.

/**
 * Tells whether an entry with the given scores could still end up in the result list, so that the
 * caller can skip computing further parameters when it cannot.
 * <p>
 * An entry is rejected when both the stemmed and the non-stemmed score are below
 * {@code OConsts.FUZZY_MATCH_THRESHOLD}. It is always accepted while the result list holds fewer than
 * {@code maxCount} items. Otherwise it is compared with the last item of the result list, first by
 * stemmed score, then by non-stemmed score, then by adjusted score; it is accepted unless the last
 * item is strictly better.
 *
 * @param simStem
 *            similarity with stemming
 * @param simNoStem
 *            similarity without stemming
 * @param simExactly
 *            exact similarity, compared against the stored adjusted score
 * @return true if the entry has a chance to be added
 */
protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final int simExactly) {
    final boolean stemTooLow = simStem < OConsts.FUZZY_MATCH_THRESHOLD;
    final boolean noStemTooLow = simNoStem < OConsts.FUZZY_MATCH_THRESHOLD;
    if (stemTooLow && noStemTooLow) {
        return false;
    }
    if (result.size() < maxCount) {
        // the list is not full yet, anything above the threshold fits
        return true;
    }
    // the list is full: the new entry must not rank below the current last item
    final NearString last = result.get(result.size() - 1);
    if (last.scores[0].score != simStem) {
        return last.scores[0].score < simStem;
    }
    if (last.scores[0].scoreNoStem != simNoStem) {
        return last.scores[0].scoreNoStem < simNoStem;
    }
    return last.scores[0].adjustedScore <= simExactly;
}
Also used : NearString(org.omegat.core.matching.NearString)

Example 8 with NearString

use of org.omegat.core.matching.NearString in project omegat by omegat-org.

In the class FindMatches, method addNearString.

/**
 * Inserts a match into the result list, keeping the list ordered by stemmed score, then non-stemmed
 * score, then adjusted score (highest first).
 * <p>
 * If an item with the same source and translation is encountered before the insertion point is
 * found, the new data is merged into that item via {@code NearString.merge} and nothing is inserted.
 * Among otherwise equal 100% matches, a source equal to the searched text goes before one that is
 * not. After insertion the list is trimmed to {@code maxCount} items by dropping the last one.
 */
protected void addNearString(final EntryKey key, final String source, final String translation, NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int similarity, final int similarityNoStem, final int simAdjusted, final byte[] similarityData, final String tmxName, final String creator, final long creationDate, final String changer, final long changedDate, final List<TMXProp> tuProperties) {
    // default: append at the end unless an existing item turns out to rank lower
    int insertAt = result.size();
    for (int idx = 0; idx < result.size(); idx++) {
        final NearString existing = result.get(idx);
        final boolean sameSource = source.equals(existing.source);
        if (sameSource && Objects.equals(translation, existing.translation)) {
            // Identical matches coming from different sources are consolidated into one NearString
            // carrying multiple project entries.
            result.set(idx, NearString.merge(existing, key, source, translation, comesFrom, fuzzy, similarity, similarityNoStem, simAdjusted, similarityData, tmxName, creator, creationDate, changer, changedDate, tuProperties));
            return;
        }
        final boolean newRanksHigher;
        if (existing.scores[0].score != similarity) {
            newRanksHigher = existing.scores[0].score < similarity;
        } else if (existing.scores[0].scoreNoStem != similarityNoStem) {
            newRanksHigher = existing.scores[0].scoreNoStem < similarityNoStem;
        } else if (existing.scores[0].adjustedScore < simAdjusted) {
            newRanksHigher = true;
        } else {
            // text with the same case has precedence
            newRanksHigher = similarity == 100 && !existing.source.equals(srcText) && source.equals(srcText);
        }
        if (newRanksHigher) {
            insertAt = idx;
            break;
        }
    }
    final NearString added = new NearString(key, source, translation, comesFrom, fuzzy, similarity, similarityNoStem, simAdjusted, similarityData, tmxName, creator, creationDate, changer, changedDate, tuProperties);
    result.add(insertAt, added);
    if (result.size() > maxCount) {
        // keep only the best maxCount items
        result.remove(result.size() - 1);
    }
}
Also used : NearString(org.omegat.core.matching.NearString)

Example 9 with NearString

use of org.omegat.core.matching.NearString in project omegat by omegat-org.

In the class FindMatches, method search.

/**
 * Searches for near matches of the given text and returns them as an ordered list.
 * <p>
 * The method resets the instance state ({@code result}, {@code srcText}, {@code removedText} and the
 * token arrays) and then feeds candidates to {@code processEntry} from, in this order: the project's
 * default translations (when supported), its multiple translations, the external translation
 * memories, and the source-file translations of all project entries. When a
 * {@code separateSegmentMatcher} is configured and the text splits into more than one segment, a
 * candidate glued together from per-segment matches is processed as well.
 *
 * @param searchText
 *            text to find matches for
 * @param requiresTranslation
 *            if true, candidates whose translation is null are skipped
 * @param fillSimilarityData
 *            if true, the {@code attr} field of every returned item is filled with similarity data
 * @param stop
 *            handed to {@code checkStopped} before each candidate so the search can be aborted
 * @return the list of matches collected in {@code result}
 * @throws StoppedException
 *             declared for the case where the search is stopped; presumably raised by
 *             {@code checkStopped}
 */
public List<NearString> search(final String searchText, final boolean requiresTranslation, final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
    result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
    srcText = searchText;
    removedText = "";
    // Strip everything matched by removePattern from the text used for matching; the stripped
    // pieces are concatenated and kept in removedText.
    if (removePattern != null) {
        StringBuilder removedBuffer = new StringBuilder();
        Matcher removeMatcher = removePattern.matcher(srcText);
        while (removeMatcher.find()) {
            removedBuffer.append(removeMatcher.group());
        }
        srcText = removeMatcher.replaceAll("");
        removedText = removedBuffer.toString();
    }
    // get tokens for original string
    strTokensStem = tokenizeStem(srcText);
    strTokensNoStem = tokenizeNoStem(srcText);
    strTokensAll = tokenizeAll(srcText);
    // travel by project entries, including orphaned
    if (project.getProjectProperties().isSupportDefaultTranslations()) {
        project.iterateByDefaultTranslations(new DefaultTranslationsIterator() {

            @Override
            public void iterate(String source, TMXEntry trans) {
                checkStopped(stop);
                if (!searchExactlyTheSame && source.equals(searchText)) {
                    // skip original==original entry comparison
                    return;
                }
                if (requiresTranslation && trans.translation == null) {
                    return;
                }
                String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
                processEntry(null, source, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0, fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate, null);
            }
        });
    }
    project.iterateByMultipleTranslations(new MultipleTranslationsIterator() {

        @Override
        public void iterate(EntryKey source, TMXEntry trans) {
            checkStopped(stop);
            if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
                // skip original==original entry comparison
                return;
            }
            if (requiresTranslation && trans.translation == null) {
                return;
            }
            String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
            processEntry(source, source.sourceText, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0, fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate, null);
        }
    });
    // travel by translation memories
    for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
        // a penalty may be encoded in the memory's key; it is extracted with SEARCH_FOR_PENALTY
        int penalty = 0;
        Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
        if (matcher.find()) {
            penalty = Integer.parseInt(matcher.group(1));
        }
        for (PrepareTMXEntry tmen : en.getValue().getEntries()) {
            checkStopped(stop);
            if (tmen.source == null) {
                // Not all TMX entries have a source; in that case there can be no meaningful match, so skip.
                continue;
            }
            if (requiresTranslation && tmen.translation == null) {
                continue;
            }
            processEntry(null, tmen.source, tmen.translation, NearString.MATCH_SOURCE.TM, false, penalty, en.getKey(), tmen.creator, tmen.creationDate, tmen.changer, tmen.changeDate, tmen.otherProperties);
        }
    }
    // travel by all entries for check source file translations
    for (SourceTextEntry ste : project.getAllEntries()) {
        checkStopped(stop);
        if (ste.getSourceTranslation() != null) {
            processEntry(ste.getKey(), ste.getSrcText(), ste.getSourceTranslation(), NearString.MATCH_SOURCE.MEMORY, ste.isSourceTranslationFuzzy(), 0, ste.getKey().file, "", 0, "", 0, null);
        }
    }
    if (separateSegmentMatcher != null) {
        // split paragraph even when segmentation disabled, then find matches for every segment
        List<StringBuilder> spaces = new ArrayList<>();
        List<Rule> brules = new ArrayList<>();
        Language sourceLang = project.getProjectProperties().getSourceLanguage();
        Language targetLang = project.getProjectProperties().getTargetLanguage();
        List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
        if (segments.size() > 1) {
            List<String> fsrc = new ArrayList<>(segments.size());
            List<String> ftrans = new ArrayList<>(segments.size());
            // multiple segments; iterate directly instead of using a short counter, which would
            // wrap to a negative index once there are more than Short.MAX_VALUE segments
            for (String onesrc : segments) {
                // find match for separate segment
                List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation, false, stop);
                if (!segmentMatch.isEmpty() && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
                    fsrc.add(segmentMatch.get(0).source);
                    ftrans.add(segmentMatch.get(0).translation);
                } else {
                    // keep list positions aligned with the segments when there is no good match
                    fsrc.add("");
                    ftrans.add("");
                }
            }
            // glue found sources
            String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
            // glue found translations
            String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
            processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "", 0, null);
        }
    }
    if (fillSimilarityData) {
        // fill similarity data only for result
        for (NearString near : result) {
            // fix for bug 1586397
            byte[] similarityData = FuzzyMatcher.buildSimilarityData(strTokensAll, tokenizeAll(near.source));
            near.attr = similarityData;
        }
    }
    return result;
}
Also used : EntryKey(org.omegat.core.data.EntryKey) Matcher(java.util.regex.Matcher) FuzzyMatcher(org.omegat.core.matching.FuzzyMatcher) ArrayList(java.util.ArrayList) NearString(org.omegat.core.matching.NearString) NearString(org.omegat.core.matching.NearString) Language(org.omegat.util.Language) SourceTextEntry(org.omegat.core.data.SourceTextEntry) DefaultTranslationsIterator(org.omegat.core.data.IProject.DefaultTranslationsIterator) ExternalTMX(org.omegat.core.data.ExternalTMX) Rule(org.omegat.core.segmentation.Rule) HashMap(java.util.HashMap) Map(java.util.Map) PrepareTMXEntry(org.omegat.core.data.PrepareTMXEntry) PrepareTMXEntry(org.omegat.core.data.PrepareTMXEntry) TMXEntry(org.omegat.core.data.TMXEntry) MultipleTranslationsIterator(org.omegat.core.data.IProject.MultipleTranslationsIterator)

Example 10 with NearString

use of org.omegat.core.matching.NearString in project omegat by omegat-org.

In the class MainWindow, method doRecycleTrans.

/**
 * Replaces the entire edit area with the text selected in the matcher or, when nothing is selected,
 * with the translation of the active fuzzy match.
 * <p>
 * Does nothing when no project is loaded or when there is neither a selection nor an active match.
 * When the {@code Preferences.CONVERT_NUMBERS} preference is set, the match translation is first
 * passed through the matcher's {@code substituteNumbers}. Matches that come from a translation memory
 * located under the "mt" folder of the TM root are inserted with {@code replaceEditTextAndMark}
 * instead of {@code replaceEditText}. The editor gets the focus after any replacement.
 */
public void doRecycleTrans() {
    if (!Core.getProject().isProjectLoaded()) {
        return;
    }
    String selected = getSelectedTextInMatcher();
    if (StringUtil.isEmpty(selected)) {
        NearString match = Core.getMatcher().getActiveMatch();
        if (match == null) {
            // nothing selected and no active match: leave the editor untouched
            return;
        }
        String replacement = Preferences.isPreference(Preferences.CONVERT_NUMBERS)
                ? Core.getMatcher().substituteNumbers(Core.getEditor().getCurrentEntry().getSrcText(), match.source, match.translation)
                : match.translation;
        boolean fromMtFolder = false;
        if (match.comesFrom == NearString.MATCH_SOURCE.TM) {
            // only TM matches are checked against the "mt" folder of the TM root
            File mtFolder = new File(Core.getProject().getProjectProperties().getTMRoot(), "mt");
            fromMtFolder = FileUtil.isInPath(mtFolder, new File(match.projs[0]));
        }
        if (fromMtFolder) {
            Core.getEditor().replaceEditTextAndMark(replacement);
        } else {
            Core.getEditor().replaceEditText(replacement);
        }
    } else {
        // an explicit selection in the matcher takes priority over the active match
        Core.getEditor().replaceEditText(selected);
    }
    Core.getEditor().requestFocus();
}
Also used : NearString(org.omegat.core.matching.NearString) NearString(org.omegat.core.matching.NearString) File(java.io.File)

Aggregations

NearString (org.omegat.core.matching.NearString)12 Point (java.awt.Point)4 File (java.io.File)3 ArrayList (java.util.ArrayList)3 SourceTextEntry (org.omegat.core.data.SourceTextEntry)3 TMXEntry (org.omegat.core.data.TMXEntry)3 ActionEvent (java.awt.event.ActionEvent)2 ActionListener (java.awt.event.ActionListener)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Map (java.util.Map)2 JMenuItem (javax.swing.JMenuItem)2 StyledDocument (javax.swing.text.StyledDocument)2 TextRun (org.omegat.core.matching.DiffDriver.TextRun)2 Token (org.omegat.util.Token)2 Component (java.awt.Component)1 Dimension (java.awt.Dimension)1 MouseAdapter (java.awt.event.MouseAdapter)1 MouseEvent (java.awt.event.MouseEvent)1 MouseListener (java.awt.event.MouseListener)1