Search in sources :

Example 6 with Rule

use of org.omegat.core.segmentation.Rule in project omegat by omegat-org.

In class FindMatches, method search:

/**
 * Collects matches for {@code searchText} from every candidate source the project exposes.
 *
 * Candidates are visited in this order: default translations (only when the project
 * properties report support for them), multiple translations, entries of the external
 * translation memories, source-file translations of all project entries and, when a
 * separate segment matcher is configured, one entry glued together from the best
 * per-segment matches. Each candidate is handed to {@code processEntry}; the list
 * assigned to {@code result} at the start of the call is what gets returned.
 *
 * @param searchText
 *            source text to look for
 * @param requiresTranslation
 *            when true, memory and TM candidates whose translation is null are skipped
 * @param fillSimilarityData
 *            when true, similarity data against {@code searchText}'s tokens is stored in
 *            {@code attr} of every returned match
 * @param stop
 *            passed to {@code checkStopped} before each candidate is examined
 * @return the matches accumulated in {@code result}
 * @throws StoppedException
 *             declared by this method; presumably raised by {@code checkStopped} when the
 *             search has been cancelled (confirm against its implementation)
 */
public List<NearString> search(final String searchText, final boolean requiresTranslation, final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
    result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
    srcText = searchText;
    removedText = "";
    // Strip everything matching removePattern from the text that is going to be matched,
    // and remember the stripped pieces (concatenated) in removedText.
    if (removePattern != null) {
        StringBuilder removedBuffer = new StringBuilder();
        Matcher removeMatcher = removePattern.matcher(srcText);
        while (removeMatcher.find()) {
            removedBuffer.append(removeMatcher.group());
        }
        srcText = removeMatcher.replaceAll("");
        removedText = removedBuffer.toString();
    }
    // get tokens for original string
    strTokensStem = tokenizeStem(srcText);
    strTokensNoStem = tokenizeNoStem(srcText);
    strTokensAll = tokenizeAll(srcText);
    // travel by project entries, including orphaned
    if (project.getProjectProperties().isSupportDefaultTranslations()) {
        project.iterateByDefaultTranslations(new DefaultTranslationsIterator() {

            @Override
            public void iterate(String source, TMXEntry trans) {
                checkStopped(stop);
                if (!searchExactlyTheSame && source.equals(searchText)) {
                    // skip original==original entry comparison
                    return;
                }
                if (requiresTranslation && trans.translation == null) {
                    return;
                }
                String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
                processEntry(null, source, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0, fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate, null);
            }
        });
    }
    project.iterateByMultipleTranslations(new MultipleTranslationsIterator() {

        @Override
        public void iterate(EntryKey source, TMXEntry trans) {
            checkStopped(stop);
            if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
                // skip original==original entry comparison
                return;
            }
            if (requiresTranslation && trans.translation == null) {
                return;
            }
            String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
            processEntry(source, source.sourceText, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0, fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate, null);
        }
    });
    // travel by translation memories
    for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
        // the penalty is read from the memory's key when SEARCH_FOR_PENALTY finds a match in it
        int penalty = 0;
        Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
        if (matcher.find()) {
            penalty = Integer.parseInt(matcher.group(1));
        }
        for (PrepareTMXEntry tmen : en.getValue().getEntries()) {
            checkStopped(stop);
            if (tmen.source == null) {
                // Not all TMX entries have a source; in that case there can be no meaningful match, so skip.
                continue;
            }
            if (requiresTranslation && tmen.translation == null) {
                continue;
            }
            processEntry(null, tmen.source, tmen.translation, NearString.MATCH_SOURCE.TM, false, penalty, en.getKey(), tmen.creator, tmen.creationDate, tmen.changer, tmen.changeDate, tmen.otherProperties);
        }
    }
    // travel by all entries for check source file translations
    for (SourceTextEntry ste : project.getAllEntries()) {
        checkStopped(stop);
        if (ste.getSourceTranslation() != null) {
            processEntry(ste.getKey(), ste.getSrcText(), ste.getSourceTranslation(), NearString.MATCH_SOURCE.MEMORY, ste.isSourceTranslationFuzzy(), 0, ste.getKey().file, "", 0, "", 0, null);
        }
    }
    if (separateSegmentMatcher != null) {
        // split paragraph even when segmentation disabled, then find matches for every segment
        List<StringBuilder> spaces = new ArrayList<>();
        List<Rule> brules = new ArrayList<>();
        Language sourceLang = project.getProjectProperties().getSourceLanguage();
        Language targetLang = project.getProjectProperties().getTargetLanguage();
        List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
        if (segments.size() > 1) {
            List<String> fsrc = new ArrayList<>(segments.size());
            List<String> ftrans = new ArrayList<>(segments.size());
            // Multiple segments. Iterating directly (instead of with a short counter) keeps
            // the loop correct for any number of segments: a short index would wrap to a
            // negative value past Short.MAX_VALUE.
            for (String onesrc : segments) {
                // find match for separate segment
                List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation, false, stop);
                if (!segmentMatch.isEmpty() && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
                    NearString best = segmentMatch.get(0);
                    fsrc.add(best.source);
                    ftrans.add(best.translation);
                } else {
                    // keep the lists aligned with the segments so that glue() sees one item per segment
                    fsrc.add("");
                    ftrans.add("");
                }
            }
            // glue found sources
            String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
            // glue found translations
            String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
            processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "", 0, null);
        }
    }
    if (fillSimilarityData) {
        // fill similarity data only for result
        for (NearString near : result) {
            // fix for bug 1586397
            byte[] similarityData = FuzzyMatcher.buildSimilarityData(strTokensAll, tokenizeAll(near.source));
            near.attr = similarityData;
        }
    }
    return result;
}
Also used : EntryKey(org.omegat.core.data.EntryKey) Matcher(java.util.regex.Matcher) FuzzyMatcher(org.omegat.core.matching.FuzzyMatcher) ArrayList(java.util.ArrayList) NearString(org.omegat.core.matching.NearString) NearString(org.omegat.core.matching.NearString) Language(org.omegat.util.Language) SourceTextEntry(org.omegat.core.data.SourceTextEntry) DefaultTranslationsIterator(org.omegat.core.data.IProject.DefaultTranslationsIterator) ExternalTMX(org.omegat.core.data.ExternalTMX) Rule(org.omegat.core.segmentation.Rule) HashMap(java.util.HashMap) Map(java.util.Map) PrepareTMXEntry(org.omegat.core.data.PrepareTMXEntry) PrepareTMXEntry(org.omegat.core.data.PrepareTMXEntry) TMXEntry(org.omegat.core.data.TMXEntry) MultipleTranslationsIterator(org.omegat.core.data.IProject.MultipleTranslationsIterator)

Example 7 with Rule

use of org.omegat.core.segmentation.Rule in project omegat by omegat-org.

In class ParseEntry, method addEntryWithProperties:

/**
 * Callback through which a filter hands a freshly read source-file entry to OmegaT.
 *
 * The source (and the translation, if present) is passed through {@code stripSomeChars}
 * and Unicode normalization. When tag removal is enabled, OmegaT tags are also stripped
 * from the protected parts, and parts that become empty are removed from the supplied
 * list. With sentence segmenting enabled, a source that splits into several segments is
 * registered segment by segment without translation; otherwise it is registered as a
 * single segment together with its translation.
 *
 * @param id
 *            ID of entry, if format supports it
 * @param source
 *            translatable source string; an empty value makes the call a no-op
 * @param translation
 *            translation of the source string, if format supports it
 * @param isFuzzy
 *            flag for fuzzy translation. If a translation is fuzzy, it is not added to the
 *            project's TMX, but it is added to the generated 'reference' TMX, a special TMX
 *            that is used as extra reference during translation.
 * @param props
 *            flat array of alternating key and value items (metadata) for the entry; the keys
 *            do not need to identify the entry uniquely
 * @param path
 *            path of entry in file
 * @param filter
 *            filter which produces entry (not referenced by this implementation)
 * @param protectedParts
 *            protected parts; may be modified in place when tag removal is enabled
 * @throws IllegalArgumentException
 *             if {@code props} is non-null and has an odd number of items
 */
@Override
public void addEntryWithProperties(String id, String source, String translation, boolean isFuzzy, String[] props, String path, IFilter filter, List<ProtectedPart> protectedParts) {
    if (StringUtil.isEmpty(source)) {
        // nothing to store for an empty source
        return;
    }
    boolean oddPropertyCount = props != null && props.length % 2 != 0;
    if (oddPropertyCount) {
        throw new IllegalArgumentException("Entry properties must be in a key=value array with an even number of items.");
    }

    ParseEntryResult stripInfo = new ParseEntryResult();
    boolean removeSpaces = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
    source = StringUtil.normalizeUnicode(stripSomeChars(source, stripInfo, config.isRemoveTags(), removeSpaces));

    if (config.isRemoveTags() && protectedParts != null) {
        // Strip OmegaT tags from every protected part; drop the parts that end up empty.
        int idx = 0;
        while (idx < protectedParts.size()) {
            ProtectedPart part = protectedParts.get(idx);
            String withoutTags = PatternConsts.OMEGAT_TAG.matcher(part.getTextInSourceSegment()).replaceAll("");
            if (withoutTags.isEmpty()) {
                // the next element slides into this position, so the index stays put
                protectedParts.remove(idx);
            } else {
                part.setTextInSourceSegment(withoutTags);
                idx++;
            }
        }
    }

    if (translation != null) {
        translation = StringUtil.normalizeUnicode(stripSomeChars(translation, stripInfo, config.isRemoveTags(), removeSpaces));
    }

    if (!config.isSentenceSegmentingEnabled()) {
        // paragraph-level entry: keep the whole source as segment 0
        internalAddSegment(id, (short) 0, source, translation, isFuzzy, props, path, protectedParts);
        return;
    }

    List<StringBuilder> spaces = new ArrayList<>();
    List<Rule> brules = new ArrayList<>();
    Language sourceLang = config.getSourceLanguage();
    List<String> segments = Core.getSegmenter().segment(sourceLang, source, spaces, brules);
    if (segments.size() == 1) {
        // a single segment keeps the translation supplied by the filter
        internalAddSegment(id, (short) 0, segments.get(0), translation, isFuzzy, props, path, protectedParts);
        return;
    }
    // several segments: each one is added without translation and with only
    // the protected parts extracted for its own text
    for (short segmentNo = 0; segmentNo < segments.size(); segmentNo++) {
        String segmentSource = segments.get(segmentNo);
        List<ProtectedPart> partsOfSegment = ProtectedPart.extractFor(protectedParts, segmentSource);
        internalAddSegment(id, segmentNo, segmentSource, null, false, props, path, partsOfSegment);
    }
}
Also used : ArrayList(java.util.ArrayList) Language(org.omegat.util.Language) Rule(org.omegat.core.segmentation.Rule)

Example 8 with Rule

use of org.omegat.core.segmentation.Rule in project omegat by omegat-org.

In class TranslateEntry, method getTranslation:

/**
 * {@inheritDoc}
 *
 * Implementation notes: the source is stripped and normalized the same way as when the
 * entry was parsed, translated as a whole or segment by segment depending on the
 * sentence-segmenting setting, and then post-processed: for .docx/.docm files the first
 * tag is moved to the front, removed tags are appended, line endings are converted back
 * and the leading/trailing whitespace of {@code origSource} is restored.
 *
 * @return the translated text, or {@code null} when nothing was translated (no translation
 *         for the entry, or for none of its segments)
 */
@Override
public String getTranslation(final String id, final String origSource, final String path) {
    ParseEntry.ParseEntryResult spr = new ParseEntry.ParseEntryResult();
    // fix for bug 3487497;
    // Fetch removed tags if the options
    // has been enabled.
    String tags = null;
    if (config.isRemoveTags()) {
        tags = TagUtil.buildTagListForRemove(origSource);
    }
    boolean removeSpaces = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
    final String source = StringUtil.normalizeUnicode(ParseEntry.stripSomeChars(origSource, spr, config.isRemoveTags(), removeSpaces));
    StringBuilder res = new StringBuilder();
    if (config.isSentenceSegmentingEnabled()) {
        boolean translated = false;
        List<StringBuilder> spaces = new ArrayList<>();
        List<Rule> brules = new ArrayList<>();
        Language sourceLang = config.getSourceLanguage();
        Language targetLang = config.getTargetLanguage();
        List<String> segments = Core.getSegmenter().segment(sourceLang, source, spaces, brules);
        for (int i = 0; i < segments.size(); i++) {
            String onesrc = segments.get(i);
            String tr = internalGetSegmentTranslation(id, i, onesrc, path);
            if (tr == null) {
                // untranslated segments fall back to their source text
                tr = onesrc;
            } else {
                translated = true;
            }
            segments.set(i, tr);
        }
        if (!translated) {
            // there is no even one translated segment
            return null;
        }
        res.append(Core.getSegmenter().glue(sourceLang, targetLang, segments, spaces, brules));
    } else {
        String tr = internalGetSegmentTranslation(id, 0, source, path);
        if (tr == null) {
            // non-translated
            return null;
        }
        res.append(tr);
    }
    String r = res.toString();
    // - Word: anything placed before the leading tag is omitted in translated document
    // https://sourceforge.net/p/omegat/bugs/634/
    // This is a Word document, Remove Tags (from Project Properties) is not checked and Remove leading and
    // trailing tags (from File Filters) is not checked
    String fileName = getCurrentFile().toLowerCase(Locale.ENGLISH);
    if ((fileName.endsWith(".docx") || fileName.endsWith(".docm")) && !config.isRemoveTags() && !Core.getFilterMaster().getConfig().isRemoveTags()) {
        // Locate the location of the first tag
        String firstTag = TagUtil.getFirstTag(r);
        if (firstTag != null) {
            int locFirstTag = r.indexOf(firstTag);
            // Is there text before that first tag?
            if (locFirstTag > 0) {
                int afterTag = locFirstTag + firstTag.length();
                // Was the first tag between two words without any spaces around?
                // When the tag is the very last thing in the string there is no character
                // after it: codePointAt(afterTag) would throw, and no space is needed.
                boolean tagGluedBetweenWords = afterTag < r.length()
                        && !Character.isWhitespace(r.codePointBefore(locFirstTag))
                        && !Character.isWhitespace(r.codePointAt(afterTag));
                String addSpace = "";
                if (tagGluedBetweenWords && Core.getProject().getProjectProperties().getTargetLanguage().isSpaceDelimited()) {
                    addSpace = " ";
                }
                // Move that first tag before the text, adding a space if needed.
                r = firstTag + r.substring(0, locFirstTag) + addSpace + r.substring(afterTag);
            }
        }
    }
    // fix for bug 3487497: append the tags collected for removal (see above) at
    // the end of the translated string.
    if (config.isRemoveTags()) {
        r += tags;
    }
    // Replace all occurrences of LF (\n) by either CRLF (\r\n) or a single CR (\r),
    // according to the flags stripSomeChars recorded for the original source.
    // fix for bug 1462566
    if (spr.crlf) {
        r = r.replace("\n", "\r\n");
    } else if (spr.cr) {
        r = r.replace("\n", "\r");
    }
    // restore the leading and trailing characters that were stripped from the original source
    if (spr.spacesAtBegin > 0) {
        r = origSource.substring(0, spr.spacesAtBegin) + r;
    }
    if (spr.spacesAtEnd > 0) {
        r = r + origSource.substring(origSource.length() - spr.spacesAtEnd);
    }
    return r;
}
Also used : ArrayList(java.util.ArrayList) Language(org.omegat.util.Language) Rule(org.omegat.core.segmentation.Rule)

Aggregations

Rule (org.omegat.core.segmentation.Rule)8 ArrayList (java.util.ArrayList)3 Language (org.omegat.util.Language)3 HashMap (java.util.HashMap)1 Map (java.util.Map)1 Matcher (java.util.regex.Matcher)1 EntryKey (org.omegat.core.data.EntryKey)1 ExternalTMX (org.omegat.core.data.ExternalTMX)1 DefaultTranslationsIterator (org.omegat.core.data.IProject.DefaultTranslationsIterator)1 MultipleTranslationsIterator (org.omegat.core.data.IProject.MultipleTranslationsIterator)1 PrepareTMXEntry (org.omegat.core.data.PrepareTMXEntry)1 SourceTextEntry (org.omegat.core.data.SourceTextEntry)1 TMXEntry (org.omegat.core.data.TMXEntry)1 FuzzyMatcher (org.omegat.core.matching.FuzzyMatcher)1 NearString (org.omegat.core.matching.NearString)1