Use of org.omegat.core.segmentation.Rule in project omegat by omegat-org.
Class FindMatches, method search.
/**
 * Collects matches for {@code searchText} from every match source known to the project.
 *
 * <p>Candidates are visited in this order, each one being handed to {@code processEntry}:
 * <ol>
 * <li>default translations (only when the project supports them),</li>
 * <li>multiple translations,</li>
 * <li>entries of the external translation memories,</li>
 * <li>source-file translations of all project entries,</li>
 * <li>when {@code separateSegmentMatcher} is set and the text splits into more than one
 * segment, a single entry glued together from the best per-segment matches.</li>
 * </ol>
 * The method resets and then returns the {@code result} list; NOTE(review): the list is
 * presumably populated by {@code processEntry}, whose body is not visible here.
 *
 * @param searchText
 *            text to find matches for
 * @param requiresTranslation
 *            when {@code true}, project and TM entries whose translation is {@code null} are skipped
 * @param fillSimilarityData
 *            when {@code true}, similarity data is built for every returned match and stored in
 *            its {@code attr} field
 * @param stop
 *            handed to {@code checkStopped} before each candidate is examined
 * @return the list of matches accumulated in {@code result}
 * @throws StoppedException
 *             declared by this method; presumably raised through {@code checkStopped} (confirm there)
 */
public List<NearString> search(final String searchText, final boolean requiresTranslation, final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
    result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
    srcText = searchText;
    removedText = "";

    // Strip everything matched by removePattern from the text that gets compared, and keep the
    // stripped pieces (concatenated in order of appearance) in removedText.
    if (removePattern != null) {
        StringBuilder removedBuffer = new StringBuilder();
        Matcher removeMatcher = removePattern.matcher(srcText);
        while (removeMatcher.find()) {
            removedBuffer.append(removeMatcher.group());
        }
        srcText = removeMatcher.replaceAll("");
        removedText = removedBuffer.toString();
    }

    // get tokens for original string
    strTokensStem = tokenizeStem(srcText);
    strTokensNoStem = tokenizeNoStem(srcText);
    strTokensAll = tokenizeAll(srcText);

    // travel by project entries, including orphaned
    if (project.getProjectProperties().isSupportDefaultTranslations()) {
        project.iterateByDefaultTranslations(new DefaultTranslationsIterator() {
            public void iterate(String source, TMXEntry trans) {
                checkStopped(stop);
                if (!searchExactlyTheSame && source.equals(searchText)) {
                    // skip original==original entry comparison
                    return;
                }
                if (requiresTranslation && trans.translation == null) {
                    return;
                }
                String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
                processEntry(null, source, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0, fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate, null);
            }
        });
    }
    project.iterateByMultipleTranslations(new MultipleTranslationsIterator() {
        public void iterate(EntryKey source, TMXEntry trans) {
            checkStopped(stop);
            if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
                // skip original==original entry comparison
                return;
            }
            if (requiresTranslation && trans.translation == null) {
                return;
            }
            String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
            processEntry(source, source.sourceText, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0, fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate, null);
        }
    });

    // travel by translation memories
    for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
        // The penalty is the number captured by SEARCH_FOR_PENALTY in the memory's key, or 0 if
        // the key does not match.
        int penalty = 0;
        Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
        if (matcher.find()) {
            penalty = Integer.parseInt(matcher.group(1));
        }
        for (PrepareTMXEntry tmen : en.getValue().getEntries()) {
            checkStopped(stop);
            if (tmen.source == null) {
                // Not all TMX entries have a source; in that case there can be no meaningful match, so skip.
                continue;
            }
            if (requiresTranslation && tmen.translation == null) {
                continue;
            }
            processEntry(null, tmen.source, tmen.translation, NearString.MATCH_SOURCE.TM, false, penalty, en.getKey(), tmen.creator, tmen.creationDate, tmen.changer, tmen.changeDate, tmen.otherProperties);
        }
    }

    // travel by all entries for check source file translations
    for (SourceTextEntry ste : project.getAllEntries()) {
        checkStopped(stop);
        if (ste.getSourceTranslation() != null) {
            processEntry(ste.getKey(), ste.getSrcText(), ste.getSourceTranslation(), NearString.MATCH_SOURCE.MEMORY, ste.isSourceTranslationFuzzy(), 0, ste.getKey().file, "", 0, "", 0, null);
        }
    }

    if (separateSegmentMatcher != null) {
        // split paragraph even when segmentation disabled, then find matches for every segment
        List<StringBuilder> spaces = new ArrayList<>();
        List<Rule> brules = new ArrayList<>();
        Language sourceLang = project.getProjectProperties().getSourceLanguage();
        Language targetLang = project.getProjectProperties().getTargetLanguage();
        List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
        if (segments.size() > 1) {
            List<String> fsrc = new ArrayList<>(segments.size());
            List<String> ftrans = new ArrayList<>(segments.size());
            // Multiple segments. Iterate without an index: a short counter would wrap to a
            // negative value after 32767 segments and make segments.get(i) throw.
            for (String onesrc : segments) {
                // find match for separate segment
                List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation, false, stop);
                if (!segmentMatch.isEmpty() && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
                    fsrc.add(segmentMatch.get(0).source);
                    ftrans.add(segmentMatch.get(0).translation);
                } else {
                    // Keep the lists aligned with the segments: an empty placeholder stands in
                    // for a segment without a good enough match.
                    fsrc.add("");
                    ftrans.add("");
                }
            }
            // glue found sources
            String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
            // glue found translations
            String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
            processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "", 0, null);
        }
    }

    if (fillSimilarityData) {
        // fill similarity data only for result
        for (NearString near : result) {
            // fix for bug 1586397
            byte[] similarityData = FuzzyMatcher.buildSimilarityData(strTokensAll, tokenizeAll(near.source));
            near.attr = similarityData;
        }
    }
    return result;
}
Use of org.omegat.core.segmentation.Rule in project omegat by omegat-org.
Class ParseEntry, method addEntryWithProperties.
/**
 * Called by filters to hand an entry read from a source file over to OmegaT.
 *
 * <p>The source (and the translation, if present) is cleaned with {@code stripSomeChars} and
 * Unicode-normalized. When tag removal is enabled, OmegaT tags are also stripped from the
 * protected parts, and parts that become empty are removed from the passed-in list. With
 * sentence segmentation enabled and more than one resulting segment, every segment is added
 * separately with no translation and a non-fuzzy flag; otherwise a single segment is added
 * together with the supplied translation and fuzzy flag.
 *
 * @param id
 *            ID of the entry, if the format supports it
 * @param source
 *            translatable source string; an empty value makes the call a no-op
 * @param translation
 *            translation of the source string, if the format supports it; may be {@code null}
 * @param isFuzzy
 *            fuzzy flag of the supplied translation; forwarded only when the entry is added as
 *            one segment
 * @param props
 *            flat array of alternating key and value items (metadata) for the entry; may be
 *            {@code null}
 * @param path
 *            path of the entry in the file
 * @param filter
 *            filter which produced the entry (not read by this method)
 * @param protectedParts
 *            protected parts; may be {@code null}. The list is modified in place when tag
 *            removal is enabled.
 * @throws IllegalArgumentException
 *             if {@code props} has an odd number of items
 */
@Override
public void addEntryWithProperties(String id, String source, String translation, boolean isFuzzy, String[] props, String path, IFilter filter, List<ProtectedPart> protectedParts) {
    // Nothing to store for an empty source.
    if (StringUtil.isEmpty(source)) {
        return;
    }
    boolean oddPropertyCount = props != null && props.length % 2 != 0;
    if (oddPropertyCount) {
        throw new IllegalArgumentException("Entry properties must be in a key=value array with an even number of items.");
    }

    ParseEntryResult stripInfo = new ParseEntryResult();
    boolean removeSpaces = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
    String cleanSource = StringUtil.normalizeUnicode(stripSomeChars(source, stripInfo, config.isRemoveTags(), removeSpaces));

    if (config.isRemoveTags() && protectedParts != null) {
        // Walk the list by index and advance only when the current element is kept, because a
        // removal shifts the following elements down by one.
        int idx = 0;
        while (idx < protectedParts.size()) {
            ProtectedPart part = protectedParts.get(idx);
            String withoutTags = PatternConsts.OMEGAT_TAG.matcher(part.getTextInSourceSegment()).replaceAll("");
            if (withoutTags.isEmpty()) {
                protectedParts.remove(idx);
            } else {
                part.setTextInSourceSegment(withoutTags);
                idx++;
            }
        }
    }

    String cleanTranslation = translation;
    if (translation != null) {
        cleanTranslation = StringUtil.normalizeUnicode(stripSomeChars(translation, stripInfo, config.isRemoveTags(), removeSpaces));
    }

    if (!config.isSentenceSegmentingEnabled()) {
        // No segmentation: the whole cleaned source is one segment.
        internalAddSegment(id, (short) 0, cleanSource, cleanTranslation, isFuzzy, props, path, protectedParts);
        return;
    }

    List<StringBuilder> spaces = new ArrayList<StringBuilder>();
    List<Rule> brules = new ArrayList<Rule>();
    Language sourceLang = config.getSourceLanguage();
    List<String> segments = Core.getSegmenter().segment(sourceLang, cleanSource, spaces, brules);

    if (segments.size() == 1) {
        internalAddSegment(id, (short) 0, segments.get(0), cleanTranslation, isFuzzy, props, path, protectedParts);
        return;
    }

    // Several segments: the entry-level translation cannot be attributed to any single one, so
    // each segment is added untranslated with only the protected parts extracted for it.
    for (short segmentIndex = 0; segmentIndex < segments.size(); segmentIndex++) {
        String segmentText = segments.get(segmentIndex);
        List<ProtectedPart> partsForSegment = ProtectedPart.extractFor(protectedParts, segmentText);
        internalAddSegment(id, segmentIndex, segmentText, null, false, props, path, partsForSegment);
    }
}
Use of org.omegat.core.segmentation.Rule in project omegat by omegat-org.
Class TranslateEntry, method getTranslation.
/**
 * {@inheritDoc}
 *
 * <p>The source is cleaned the same way as on parsing ({@code ParseEntry.stripSomeChars} followed
 * by Unicode normalization) and, when sentence segmentation is enabled, translated segment by
 * segment and glued back together. Untranslated segments fall back to their source text.
 * Afterwards the line endings and the leading/trailing whitespace recorded while stripping
 * {@code origSource} are restored on the result.
 *
 * @return the translated text, or {@code null} when no segment has a translation
 */
@Override
public String getTranslation(final String id, final String origSource, final String path) {
    ParseEntry.ParseEntryResult spr = new ParseEntry.ParseEntryResult();

    // fix for bug 3487497;
    // Fetch removed tags if the options
    // has been enabled.
    String tags = null;
    if (config.isRemoveTags()) {
        tags = TagUtil.buildTagListForRemove(origSource);
    }

    boolean removeSpaces = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
    final String source = StringUtil.normalizeUnicode(ParseEntry.stripSomeChars(origSource, spr, config.isRemoveTags(), removeSpaces));

    StringBuilder res = new StringBuilder();
    if (config.isSentenceSegmentingEnabled()) {
        boolean translated = false;
        List<StringBuilder> spaces = new ArrayList<StringBuilder>();
        List<Rule> brules = new ArrayList<Rule>();
        Language sourceLang = config.getSourceLanguage();
        Language targetLang = config.getTargetLanguage();
        List<String> segments = Core.getSegmenter().segment(sourceLang, source, spaces, brules);
        for (int i = 0; i < segments.size(); i++) {
            String onesrc = segments.get(i);
            String tr = internalGetSegmentTranslation(id, i, onesrc, path);
            if (tr == null) {
                // untranslated segment: keep its source text in the glued result
                tr = onesrc;
            } else {
                translated = true;
            }
            segments.set(i, tr);
        }
        if (!translated) {
            // there is no even one translated segment
            return null;
        }
        res.append(Core.getSegmenter().glue(sourceLang, targetLang, segments, spaces, brules));
    } else {
        String tr = internalGetSegmentTranslation(id, 0, source, path);
        if (tr == null) {
            // non-translated
            return null;
        }
        res.append(tr);
    }

    String r = res.toString();

    // - Word: anything placed before the leading tag is omitted in translated document
    // https://sourceforge.net/p/omegat/bugs/634/
    // This is a Word document, Remove Tags (from Project Properties) is not checked and Remove leading and
    // trailing tags (from File Filters) is not checked
    String fileName = getCurrentFile().toLowerCase(Locale.ENGLISH);
    if ((fileName.endsWith(".docx") || fileName.endsWith(".docm")) && !config.isRemoveTags() && !Core.getFilterMaster().getConfig().isRemoveTags()) {
        // Locate the location of the first tag
        String firstTag = TagUtil.getFirstTag(r);
        if (firstTag != null) {
            int locFirstTag = r.indexOf(firstTag);
            // Is there text before that first tag?
            if (locFirstTag > 0) {
                int afterFirstTag = locFirstTag + firstTag.length();
                // Was the first tag between two words without any spaces around?
                // When the tag is the very last thing in the string there is no character after
                // it; codePointAt(r.length()) would throw, and no space is needed in that case.
                String addSpace = "";
                if (afterFirstTag < r.length()
                        && !Character.isWhitespace(r.codePointBefore(locFirstTag))
                        && !Character.isWhitespace(r.codePointAt(afterFirstTag))
                        && Core.getProject().getProjectProperties().getTargetLanguage().isSpaceDelimited()) {
                    addSpace = " ";
                }
                // Move that first tag before the text, adding a space if needed.
                r = firstTag + r.substring(0, locFirstTag) + addSpace + r.substring(afterFirstTag);
            }
        }
    }

    // fix for bug 3487497;
    // explicitly add the removed tags (collected at the top of this method) at
    // the end of the translated string.
    if (config.isRemoveTags()) {
        r += tags;
    }

    // replacing all occurrences of LF (\n) by either single CR (\r) or CRLF
    // (\r\n)
    // this is a reversal of the process at the beginning of this method
    // fix for bug 1462566
    if (spr.crlf) {
        r = r.replace("\n", "\r\n");
    } else if (spr.cr) {
        r = r.replace("\n", "\r");
    }

    // Put back the leading and trailing characters of the original source that were counted in
    // spr.spacesAtBegin / spr.spacesAtEnd.
    if (spr.spacesAtBegin > 0) {
        r = origSource.substring(0, spr.spacesAtBegin) + r;
    }
    if (spr.spacesAtEnd > 0) {
        r = r + origSource.substring(origSource.length() - spr.spacesAtEnd);
    }
    return r;
}
Aggregations