use of org.omegat.core.matching.NearString in project omegat by omegat-org.
the class CalcMatchStatistics method calcMaxSimilarity.
/**
 * Computes the highest similarity score between the given entry and the near strings found
 * for it.
 * <p>
 * The match search is run on the entry's source text after {@code removeXmlTags}, while the
 * similarity itself is calculated against the tokens of {@code ste.getSrcText()}. Candidates
 * flagged as fuzzy have {@code FindMatches.PENALTY_FOR_FUZZY} subtracted from their score.
 *
 * @param ste
 *            entry to evaluate
 * @return the best score found, or 0 when no candidate scores above 0
 */
int calcMaxSimilarity(SourceTextEntry ste) {
    String tagFreeSource = removeXmlTags(ste);
    FindMatches matcher = finder.get();
    List<NearString> candidates = matcher.search(tagFreeSource, true, false, this::isInterrupted);
    final Token[] entryTokens = matcher.tokenizeAll(ste.getSrcText());

    int best = 0;
    for (NearString candidate : candidates) {
        final Token[] candidateTokens = matcher.tokenizeAll(candidate.source);
        int score = FuzzyMatcher.calcSimilarity(distanceCalculator.get(), entryTokens, candidateTokens);
        if (candidate.fuzzyMark) {
            score -= FindMatches.PENALTY_FOR_FUZZY;
        }
        if (score <= best) {
            // not an improvement, look at the next candidate
            continue;
        }
        best = score;
        if (best >= 95) {
            // 95 or more is already enough to say that we are in row 2; stop scanning
            break;
        }
    }
    return best;
}
use of org.omegat.core.matching.NearString in project omegat by omegat-org.
the class FindMatches method haveChanceToAdd.
/**
 * Checks whether an entry has a chance of being added to the result list. If it has none,
 * there is no point in calculating its other parameters.
 * <p>
 * An entry is rejected outright when both its stemmed and non-stemmed similarity are below
 * {@code OConsts.FUZZY_MATCH_THRESHOLD}. While the result list holds fewer than
 * {@code maxCount} entries, any remaining entry is accepted. Otherwise the entry is compared
 * with the last element of the result list (stemmed score first, then non-stemmed score, then
 * adjusted score) and is accepted unless the last element ranks strictly higher.
 *
 * @param simStem
 *            similarity with stemming
 * @param simNoStem
 *            similarity without stemming
 * @param simExactly
 *            exact similarity (compared against the adjusted score)
 * @return true if the entry has a chance of being added
 */
protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final int simExactly) {
    if (simStem < OConsts.FUZZY_MATCH_THRESHOLD && simNoStem < OConsts.FUZZY_MATCH_THRESHOLD) {
        return false;
    }
    if (result.size() < maxCount) {
        return true;
    }
    NearString last = result.get(result.size() - 1);
    int order = Integer.compare(last.scores[0].score, simStem);
    if (order == 0) {
        order = Integer.compare(last.scores[0].scoreNoStem, simNoStem);
    }
    if (order == 0) {
        order = Integer.compare(last.scores[0].adjustedScore, simExactly);
    }
    // Integer.compare only guarantees the sign of its result, so test the sign instead of
    // comparing against the literal 1.
    return order <= 0;
}
use of org.omegat.core.matching.NearString in project omegat by omegat-org.
the class FindMatches method addNearString.
/**
 * Inserts a near string into the result list, keeping the list ordered by similarity, then
 * similarity without stemming, then adjusted similarity (highest first).
 * <p>
 * If an entry with the same source and translation is met while scanning for the insert
 * position, the new data is merged into that entry instead and nothing is inserted. After an
 * insertion the list is trimmed back to {@code maxCount} entries by dropping the last one.
 */
protected void addNearString(final EntryKey key, final String source, final String translation, NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int similarity, final int similarityNoStem, final int simAdjusted, final byte[] similarityData, final String tmxName, final String creator, final long creationDate, final String changer, final long changedDate, final List<TMXProp> tuProperties) {
    // By default the new entry goes after every existing one.
    int insertAt = result.size();
    for (int idx = 0; idx < result.size(); idx++) {
        NearString existing = result.get(idx);
        if (source.equals(existing.source) && Objects.equals(translation, existing.translation)) {
            // Identical matches from different sources are consolidated into a single
            // NearString carrying multiple project entries.
            result.set(idx, NearString.merge(existing, key, source, translation, comesFrom, fuzzy, similarity, similarityNoStem, simAdjusted, similarityData, tmxName, creator, creationDate, changer, changedDate, tuProperties));
            return;
        }

        boolean sameScore = existing.scores[0].score == similarity;
        boolean sameNoStem = sameScore && existing.scores[0].scoreNoStem == similarityNoStem;
        boolean newRanksHigher = existing.scores[0].score < similarity
                || (sameScore && existing.scores[0].scoreNoStem < similarityNoStem)
                || (sameNoStem && existing.scores[0].adjustedScore < simAdjusted)
                // text with the same case has precedence
                || (sameNoStem && similarity == 100 && !existing.source.equals(srcText)
                        && source.equals(srcText));
        if (newRanksHigher) {
            insertAt = idx;
            break;
        }
    }

    result.add(insertAt, new NearString(key, source, translation, comesFrom, fuzzy, similarity, similarityNoStem, simAdjusted, similarityData, tmxName, creator, creationDate, changer, changedDate, tuProperties));
    if (result.size() > maxCount) {
        // keep at most maxCount entries: drop the lowest-ranked one
        result.remove(result.size() - 1);
    }
}
use of org.omegat.core.matching.NearString in project omegat by omegat-org.
the class FindMatches method search.
/**
 * Searches for near strings matching the given text.
 * <p>
 * Candidates are taken, in this order, from: the project's default translations (only when
 * the project supports them), its multiple translations, the external translation memories,
 * the source-file translations of all project entries, and finally (when a separate segment
 * matcher is configured and the text splits into more than one segment) a match glued
 * together from per-segment matches.
 *
 * @param searchText
 *            text to find matches for
 * @param requiresTranslation
 *            when true, candidates whose translation is {@code null} are skipped
 * @param fillSimilarityData
 *            when true, similarity data is computed and stored in {@code attr} of every
 *            returned near string
 * @param stop
 *            handed to {@code checkStopped} before each candidate is processed
 * @return the list of matches collected in {@code result}
 * @throws StoppedException
 *             presumably raised by {@code checkStopped} when the search has been stopped
 */
public List<NearString> search(final String searchText, final boolean requiresTranslation, final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
    result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
    srcText = searchText;
    removedText = "";
    // Strip everything matching removePattern from the text to be matched, and remember the
    // stripped pieces (concatenated) in removedText.
    if (removePattern != null) {
        StringBuilder removedBuffer = new StringBuilder();
        Matcher removeMatcher = removePattern.matcher(srcText);
        while (removeMatcher.find()) {
            removedBuffer.append(removeMatcher.group());
        }
        srcText = removeMatcher.replaceAll("");
        removedText = removedBuffer.toString();
    }
    // get tokens for original string
    strTokensStem = tokenizeStem(srcText);
    strTokensNoStem = tokenizeNoStem(srcText);
    strTokensAll = tokenizeAll(srcText);
    // travel by project entries, including orphaned
    if (project.getProjectProperties().isSupportDefaultTranslations()) {
        project.iterateByDefaultTranslations(new DefaultTranslationsIterator() {
            public void iterate(String source, TMXEntry trans) {
                checkStopped(stop);
                if (!searchExactlyTheSame && source.equals(searchText)) {
                    // skip original==original entry comparison
                    return;
                }
                if (requiresTranslation && trans.translation == null) {
                    return;
                }
                String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
                processEntry(null, source, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0, fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate, null);
            }
        });
    }
    project.iterateByMultipleTranslations(new MultipleTranslationsIterator() {
        public void iterate(EntryKey source, TMXEntry trans) {
            checkStopped(stop);
            if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
                // skip original==original entry comparison
                return;
            }
            if (requiresTranslation && trans.translation == null) {
                return;
            }
            String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
            processEntry(source, source.sourceText, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0, fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate, null);
        }
    });
    // travel by translation memories
    for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
        // a penalty may be encoded in the memory's key; it defaults to 0 when absent
        int penalty = 0;
        Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
        if (matcher.find()) {
            penalty = Integer.parseInt(matcher.group(1));
        }
        for (PrepareTMXEntry tmen : en.getValue().getEntries()) {
            checkStopped(stop);
            if (tmen.source == null) {
                // Not all TMX entries have a source; in that case there can be no meaningful match, so skip.
                continue;
            }
            if (requiresTranslation && tmen.translation == null) {
                continue;
            }
            processEntry(null, tmen.source, tmen.translation, NearString.MATCH_SOURCE.TM, false, penalty, en.getKey(), tmen.creator, tmen.creationDate, tmen.changer, tmen.changeDate, tmen.otherProperties);
        }
    }
    // travel by all entries for check source file translations
    for (SourceTextEntry ste : project.getAllEntries()) {
        checkStopped(stop);
        if (ste.getSourceTranslation() != null) {
            processEntry(ste.getKey(), ste.getSrcText(), ste.getSourceTranslation(), NearString.MATCH_SOURCE.MEMORY, ste.isSourceTranslationFuzzy(), 0, ste.getKey().file, "", 0, "", 0, null);
        }
    }
    if (separateSegmentMatcher != null) {
        // split paragraph even when segmentation disabled, then find matches for every segment
        List<StringBuilder> spaces = new ArrayList<>();
        List<Rule> brules = new ArrayList<>();
        Language sourceLang = project.getProjectProperties().getSourceLanguage();
        Language targetLang = project.getProjectProperties().getTargetLanguage();
        List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
        if (segments.size() > 1) {
            List<String> fsrc = new ArrayList<>(segments.size());
            List<String> ftrans = new ArrayList<>(segments.size());
            // multiple segments; iterate directly so no narrow index type can overflow
            for (String onesrc : segments) {
                // find match for separate segment
                List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation, false, stop);
                if (!segmentMatch.isEmpty() && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
                    fsrc.add(segmentMatch.get(0).source);
                    ftrans.add(segmentMatch.get(0).translation);
                } else {
                    // keep positions aligned with the segments when no good match exists
                    fsrc.add("");
                    ftrans.add("");
                }
            }
            // glue found sources
            String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
            // glue found translations
            String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
            processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "", 0, null);
        }
    }
    if (fillSimilarityData) {
        // fill similarity data only for result
        for (NearString near : result) {
            // fix for bug 1586397
            byte[] similarityData = FuzzyMatcher.buildSimilarityData(strTokensAll, tokenizeAll(near.source));
            near.attr = similarityData;
        }
    }
    return result;
}
use of org.omegat.core.matching.NearString in project omegat by omegat-org.
the class MainWindow method doRecycleTrans.
/**
 * Replaces the entire edit area with the text selected in the matcher or, when nothing is
 * selected there, with the translation of the active fuzzy match.
 * <p>
 * Does nothing when no project is loaded or when there is neither a selection nor an active
 * match. With the {@code CONVERT_NUMBERS} preference set, the match translation is first
 * passed through the matcher's {@code substituteNumbers}. A TM match whose first project
 * file lies inside the "mt" folder of the TM root is inserted with
 * {@code replaceEditTextAndMark}; every other match is inserted with {@code replaceEditText}.
 */
public void doRecycleTrans() {
    if (!Core.getProject().isProjectLoaded()) {
        return;
    }

    String selected = getSelectedTextInMatcher();
    if (!StringUtil.isEmpty(selected)) {
        // an explicit selection wins over the active match
        Core.getEditor().replaceEditText(selected);
        Core.getEditor().requestFocus();
        return;
    }

    NearString match = Core.getMatcher().getActiveMatch();
    if (match == null) {
        return;
    }

    String replacement = Preferences.isPreference(Preferences.CONVERT_NUMBERS)
            ? Core.getMatcher().substituteNumbers(Core.getEditor().getCurrentEntry().getSrcText(), match.source, match.translation)
            : match.translation;

    boolean fromMtFolder = match.comesFrom == NearString.MATCH_SOURCE.TM
            && FileUtil.isInPath(new File(Core.getProject().getProjectProperties().getTMRoot(), "mt"), new File(match.projs[0]));
    if (fromMtFolder) {
        Core.getEditor().replaceEditTextAndMark(replacement);
    } else {
        Core.getEditor().replaceEditText(replacement);
    }
    Core.getEditor().requestFocus();
}
Aggregations