Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class MatchState, method toFinalString.
/**
* Gets all strings formatted using the match element.
*/
public final String[] toFinalString(Language lang) throws IOException {
  String[] formattedString = new String[1];
  if (formattedToken != null) {
    int readingCount = formattedToken.getReadingsLength();
    formattedString[0] = formattedToken.getToken();
    Pattern pRegexMatch = match.getRegexMatch();
    String regexReplace = match.getRegexReplace();
    if (pRegexMatch != null) {
      formattedString[0] = pRegexMatch.matcher(formattedString[0]).replaceAll(regexReplace);
    }
    String posTag = match.getPosTag();
    if (posTag != null) {
      if (synthesizer == null) {
        formattedString[0] = formattedToken.getToken();
      } else if (match.isPostagRegexp()) {
        TreeSet<String> wordForms = new TreeSet<>();
        boolean oneForm = false;
        // if the token has no real analysis (no lemma and no POS tag, or only
        // sentence/paragraph boundary tags), use the token itself as the only form
        for (int k = 0; k < readingCount; k++) {
          if (formattedToken.getAnalyzedToken(k).getLemma() == null) {
            String posUnique = formattedToken.getAnalyzedToken(k).getPOSTag();
            if (posUnique == null) {
              wordForms.add(formattedToken.getToken());
              oneForm = true;
            } else {
              if (SENTENCE_START_TAGNAME.equals(posUnique) || SENTENCE_END_TAGNAME.equals(posUnique) || PARAGRAPH_END_TAGNAME.equals(posUnique)) {
                if (!oneForm) {
                  wordForms.add(formattedToken.getToken());
                }
                oneForm = true;
              } else {
                oneForm = false;
              }
            }
          }
        }
        String targetPosTag = getTargetPosTag();
        if (!oneForm) {
          // synthesize all word forms whose POS tag matches the target POS tag regex
          for (int i = 0; i < readingCount; i++) {
            String[] possibleWordForms = synthesizer.synthesize(formattedToken.getAnalyzedToken(i), targetPosTag, true);
            if (possibleWordForms != null) {
              wordForms.addAll(Arrays.asList(possibleWordForms));
            }
          }
        }
        if (wordForms.isEmpty()) {
          if (match.checksSpelling()) {
            formattedString[0] = "";
          } else {
            formattedString[0] = "(" + formattedToken.getToken() + ")";
          }
        } else {
          formattedString = wordForms.toArray(new String[wordForms.size()]);
        }
      } else {
        TreeSet<String> wordForms = new TreeSet<>();
        for (int i = 0; i < readingCount; i++) {
          String[] possibleWordForms = synthesizer.synthesize(formattedToken.getAnalyzedToken(i), posTag);
          if (possibleWordForms != null) {
            wordForms.addAll(Arrays.asList(possibleWordForms));
          }
        }
        formattedString = wordForms.toArray(new String[wordForms.size()]);
      }
    }
  }
  String original;
  if (match.isStaticLemma()) {
    original = matchedToken != null ? matchedToken.getToken() : "";
  } else {
    original = formattedToken != null ? formattedToken.getToken() : "";
  }
  for (int i = 0; i < formattedString.length; i++) {
    formattedString[i] = convertCase(formattedString[i], original, lang);
  }
  // TODO: should case conversion happen before or after including skipped tokens?
  IncludeRange includeSkipped = match.getIncludeSkipped();
  if (includeSkipped != IncludeRange.NONE && skippedTokens != null && !skippedTokens.isEmpty()) {
    String[] helper = new String[formattedString.length];
    for (int i = 0; i < formattedString.length; i++) {
      if (formattedString[i] == null) {
        formattedString[i] = "";
      }
      helper[i] = formattedString[i] + skippedTokens;
    }
    formattedString = helper;
  }
  if (match.checksSpelling() && lang != null) {
    List<String> formattedStringElements = Arrays.asList(formattedString);
    // tagger-based speller: drop forms that the tagger cannot analyze
    List<AnalyzedTokenReadings> analyzed = lang.getTagger().tag(formattedStringElements);
    for (int i = 0; i < formattedString.length; i++) {
      AnalyzedToken analyzedToken = analyzed.get(i).getAnalyzedToken(0);
      if (analyzedToken.getLemma() == null && analyzedToken.hasNoTag()) {
        formattedString[i] = "";
      }
    }
  }
  return formattedString;
}
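A minimal sketch of driving toFinalString from calling code, not taken from the LanguageTool sources: it builds a Match that synthesizes all forms whose POS tag matches a regex, binds it to an analyzed token via createState, and collects the strings. The German language, the POS tags, and the accessibility of the Match constructor from the calling code are assumptions for illustration; the constructor argument pattern mirrors the DisambiguationPatternRuleReplacer snippet further below.

import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.Language;
import org.languagetool.language.German;
import org.languagetool.rules.patterns.Match;
import org.languagetool.rules.patterns.MatchState;

public class ToFinalStringSketch {
  public static void main(String[] args) throws Exception {
    Language lang = new German();
    // "Haus" with a hypothetical noun reading (tag set is an assumption)
    AnalyzedTokenReadings word = new AnalyzedTokenReadings(
        new AnalyzedToken("Haus", "SUB:NOM:SIN:NEU", "Haus"), 0);
    // synthesize every form whose POS tag matches the regex SUB:.*
    Match match = new Match("SUB:.*", null, true, null, null,
        Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
    MatchState state = match.createState(lang.getSynthesizer(), word);
    for (String form : state.toFinalString(lang)) {
      System.out.println(form); // synthesized forms such as "Hauses", "Häuser", ...
    }
  }
}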
Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class DisambiguationPatternRuleReplacer, method executeAction.
private AnalyzedTokenReadings[] executeAction(AnalyzedSentence sentence, AnalyzedTokenReadings[] whiteTokens, AnalyzedTokenReadings[] unifiedTokens, int firstMatchToken, int lastMatchToken, int matchingTokens, int[] tokenPositions) {
  AnalyzedTokenReadings[] whTokens = whiteTokens.clone();
  DisambiguationPatternRule rule = (DisambiguationPatternRule) this.rule;
  int correctedStPos = 0;
  int startPositionCorrection = rule.getStartPositionCorrection();
  int endPositionCorrection = rule.getEndPositionCorrection();
  int matchingTokensWithCorrection = matchingTokens;
  List<Integer> tokenPositionList = new ArrayList<>();
  for (int i : tokenPositions) {
    tokenPositionList.add(i);
  }
  if (startPositionCorrection > 0) {
    // token positions are shifted by 1
    correctedStPos--;
    for (int j = 0; j < pTokensMatched.size(); j++) {
      if (!pTokensMatched.get(j)) {
        // add a zero-length token for the non-matching pattern element so that the position count stays correct
        tokenPositionList.add(j, 0);
      }
    }
    for (int l = 0; l <= startPositionCorrection && tokenPositionList.size() > l; l++) {
      correctedStPos += tokenPositionList.get(l);
    }
    // adjust to make sure the token count is correct, as it is checked later
    int w = startPositionCorrection;
    for (int j = 0; j <= w; j++) {
      if (j < pTokensMatched.size() && !pTokensMatched.get(j)) {
        startPositionCorrection--;
      }
    }
  }
  if (endPositionCorrection < 0) {
    // adjust the end position correction if one of the elements has not been matched
    for (int d = startPositionCorrection; d < pTokensMatched.size(); d++) {
      if (!pTokensMatched.get(d)) {
        endPositionCorrection++;
      }
    }
  }
  if (lastMatchToken != -1) {
    int maxPosCorrection = Math.max((lastMatchToken + 1 - (firstMatchToken + correctedStPos)) - matchingTokens, 0);
    matchingTokensWithCorrection += maxPosCorrection;
  }
  int fromPos = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
  boolean spaceBefore = whTokens[fromPos].isWhitespaceBefore();
  DisambiguationPatternRule.DisambiguatorAction disAction = rule.getAction();
  AnalyzedToken[] newTokenReadings = rule.getNewTokenReadings();
  Match matchElement = rule.getMatchElement();
  String disambiguatedPOS = rule.getDisambiguatedPOS();
  switch (disAction) {
    case UNIFY:
      if (unifiedTokens != null) {
        // TODO: unifiedTokens.length can be larger than matchingTokensWithCorrection in cases where there are no markers...
        if (unifiedTokens.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
          if (whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + unifiedTokens.length - 1)].isSentenceEnd()) {
            unifiedTokens[unifiedTokens.length - 1].setSentEnd();
          }
          for (int i = 0; i < unifiedTokens.length; i++) {
            int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
            unifiedTokens[i].setStartPos(whTokens[position].getStartPos());
            String prevValue = whTokens[position].toString();
            String prevAnot = whTokens[position].getHistoricalAnnotations();
            List<ChunkTag> chTags = whTokens[position].getChunkTags();
            whTokens[position] = unifiedTokens[i];
            whTokens[position].setChunkTags(chTags);
            annotateChange(whTokens[position], prevValue, prevAnot);
          }
        }
      }
      break;
    case REMOVE:
      if (newTokenReadings != null && newTokenReadings.length > 0) {
        if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
          for (int i = 0; i < newTokenReadings.length; i++) {
            int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
            String prevValue = whTokens[position].toString();
            String prevAnot = whTokens[position].getHistoricalAnnotations();
            whTokens[position].removeReading(newTokenReadings[i]);
            annotateChange(whTokens[position], prevValue, prevAnot);
          }
        }
      } else if (!StringTools.isEmpty(disambiguatedPOS)) {
        // negative filtering: remove every reading whose POS tag matches the given regex
        Pattern p = Pattern.compile(disambiguatedPOS);
        AnalyzedTokenReadings tmp = new AnalyzedTokenReadings(whTokens[fromPos].getReadings(), whTokens[fromPos].getStartPos());
        for (AnalyzedToken analyzedToken : tmp) {
          if (analyzedToken.getPOSTag() != null) {
            Matcher mPos = p.matcher(analyzedToken.getPOSTag());
            if (mPos.matches()) {
              int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
              String prevValue = whTokens[position].toString();
              String prevAnot = whTokens[position].getHistoricalAnnotations();
              whTokens[position].removeReading(analyzedToken);
              annotateChange(whTokens[position], prevValue, prevAnot);
            }
          }
        }
      }
      break;
    case ADD:
      if (newTokenReadings != null) {
        if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
          for (int i = 0; i < newTokenReadings.length; i++) {
            String token;
            int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
            if (newTokenReadings[i].getToken().isEmpty()) {
              token = whTokens[position].getToken();
            } else {
              token = newTokenReadings[i].getToken();
            }
            String lemma;
            if (newTokenReadings[i].getLemma() == null) {
              lemma = token;
            } else {
              lemma = newTokenReadings[i].getLemma();
            }
            AnalyzedToken newTok = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
            String prevValue = whTokens[position].toString();
            String prevAnot = whTokens[position].getHistoricalAnnotations();
            whTokens[position].addReading(newTok);
            annotateChange(whTokens[position], prevValue, prevAnot);
          }
        }
      }
      break;
    case FILTERALL:
      for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
        PatternToken pToken;
        if (pTokensMatched.get(i + startPositionCorrection)) {
          pToken = rule.getPatternTokens().get(i + startPositionCorrection);
        } else {
          // skip forward to the next pattern element that actually matched
          int k = 1;
          while (i + startPositionCorrection + k < rule.getPatternTokens().size() + endPositionCorrection && !pTokensMatched.get(i + startPositionCorrection + k)) {
            k++;
          }
          pToken = rule.getPatternTokens().get(i + k + startPositionCorrection);
        }
        Match tmpMatchToken = new Match(pToken.getPOStag(), null, true, pToken.getPOStag(), null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
        MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[position]);
        String prevValue = whTokens[position].toString();
        String prevAnot = whTokens[position].getHistoricalAnnotations();
        whTokens[position] = matchState.filterReadings();
        annotateChange(whTokens[position], prevValue, prevAnot);
      }
      break;
    case IMMUNIZE:
      for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
        whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].immunize();
      }
      break;
    case IGNORE_SPELLING:
      for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
        whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].ignoreSpelling();
      }
      break;
    case FILTER:
      if (matchElement == null) {
        // no <match> element; with one, FILTER behaves the same as REPLACE (see the fallthrough below)
        Match tmpMatchToken = new Match(disambiguatedPOS, null, true, disambiguatedPOS, null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
        boolean newPOSmatches = false;
        // only apply the filter rule when it matches one of the previous tags:
        for (int i = 0; i < whTokens[fromPos].getReadingsLength(); i++) {
          if (!whTokens[fromPos].getAnalyzedToken(i).hasNoTag() && whTokens[fromPos].getAnalyzedToken(i).getPOSTag() != null && whTokens[fromPos].getAnalyzedToken(i).getPOSTag().matches(disambiguatedPOS)) {
            newPOSmatches = true;
            break;
          }
        }
        if (newPOSmatches) {
          MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
          String prevValue = whTokens[fromPos].toString();
          String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
          whTokens[fromPos] = matchState.filterReadings();
          annotateChange(whTokens[fromPos], prevValue, prevAnot);
        }
        break;
      }
      // fallthrough
    case REPLACE:
    default:
      if (newTokenReadings != null && newTokenReadings.length > 0) {
        if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
          for (int i = 0; i < newTokenReadings.length; i++) {
            String token;
            int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
            if ("".equals(newTokenReadings[i].getToken())) {
              // empty token: keep the original one
              token = whTokens[position].getToken();
            } else {
              token = newTokenReadings[i].getToken();
            }
            String lemma;
            if (newTokenReadings[i].getLemma() == null) {
              // empty lemma: fall back to the token itself
              lemma = token;
            } else {
              lemma = newTokenReadings[i].getLemma();
            }
            AnalyzedToken analyzedToken = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
            AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
            whTokens[position] = replaceTokens(whTokens[position], toReplace);
          }
        }
      } else if (matchElement == null) {
        String lemma = "";
        for (AnalyzedToken analyzedToken : whTokens[fromPos]) {
          if (analyzedToken.getPOSTag() != null && analyzedToken.getPOSTag().equals(disambiguatedPOS) && analyzedToken.getLemma() != null) {
            lemma = analyzedToken.getLemma();
          }
        }
        if (StringTools.isEmpty(lemma)) {
          lemma = whTokens[fromPos].getAnalyzedToken(0).getLemma();
        }
        AnalyzedToken analyzedToken = new AnalyzedToken(whTokens[fromPos].getToken(), disambiguatedPOS, lemma);
        AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
        whTokens[fromPos] = replaceTokens(whTokens[fromPos], toReplace);
      } else {
        // using the match element
        MatchState matchElementState = matchElement.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
        String prevValue = whTokens[fromPos].toString();
        String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
        whTokens[fromPos] = matchElementState.filterReadings();
        whTokens[fromPos].setWhitespaceBefore(spaceBefore);
        annotateChange(whTokens[fromPos], prevValue, prevAnot);
      }
  }
  return whTokens;
}
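The start-position arithmetic at the top of executeAction is easy to misread, so here is a self-contained illustration of it (hypothetical data, plain Java, no LanguageTool API): an unmatched optional pattern element gets a zero-length placeholder, so summing the first startPositionCorrection + 1 entries still yields the correct token offset.

import java.util.ArrayList;
import java.util.List;

public class StartPosCorrectionSketch {
  public static void main(String[] args) {
    // pattern: [optional A (unmatched), B, C]; B and C each matched one token
    boolean[] pTokensMatched = { false, true, true };
    int[] tokenPositions = { 1, 1 };   // match lengths of the matched elements only
    int startPositionCorrection = 1;   // rule says: skip the first pattern element
    int correctedStPos = -1;           // token positions are shifted by 1
    List<Integer> positions = new ArrayList<>();
    for (int p : tokenPositions) {
      positions.add(p);
    }
    for (int j = 0; j < pTokensMatched.length; j++) {
      if (!pTokensMatched[j]) {
        positions.add(j, 0);           // zero-length placeholder for the unmatched element
      }
    }
    for (int l = 0; l <= startPositionCorrection && positions.size() > l; l++) {
      correctedStPos += positions.get(l);
    }
    System.out.println(correctedStPos); // 0: the unmatched optional element contributes nothing
  }
}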
Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class PortugueseAccentuationDataLoader, method loadWords.
Map<String, AnalyzedTokenReadings> loadWords(String path) {
  final Map<String, AnalyzedTokenReadings> map = new HashMap<>();
  final InputStream inputStream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
  try (Scanner scanner = new Scanner(inputStream, FILE_ENCODING)) {
    while (scanner.hasNextLine()) {
      final String line = scanner.nextLine().trim();
      if (line.isEmpty() || line.charAt(0) == '#') {
        // ignore empty lines and comments
        continue;
      }
      final String[] parts = line.split(";");
      if (parts.length != 3) {
        throw new RuntimeException("Format error in file " + path + ", line: " + line + ", expected 3 semicolon-separated parts, got " + parts.length);
      }
      final AnalyzedToken analyzedToken = new AnalyzedToken(parts[1], parts[2], null);
      map.put(parts[0], new AnalyzedTokenReadings(analyzedToken, 0));
    }
  }
  return map;
}
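A made-up example of the semicolon-separated line format the loader expects (key;form;POS tag). The Portuguese word and tag are illustrative assumptions; only constructors already used in the snippet above appear here.

import java.util.HashMap;
import java.util.Map;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;

public class AccentuationEntrySketch {
  public static void main(String[] args) {
    String line = "continuo;contínuo;AQ0MS0"; // hypothetical data line
    String[] parts = line.split(";");
    Map<String, AnalyzedTokenReadings> map = new HashMap<>();
    // parts[0] is the lookup key, parts[1] the word form, parts[2] its POS tag
    map.put(parts[0], new AnalyzedTokenReadings(new AnalyzedToken(parts[1], parts[2], null), 0));
    System.out.println(map.get("continuo").getAnalyzedToken(0).getToken()); // "contínuo"
  }
}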
Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class RussianTagger, method tag.
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
  List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  int pos = 0;
  for (String word : sentenceTokens) {
    if (word.length() > 1) {
      // strip stress marks (vowel + combining acute/grave accent, or the
      // precomposed ѐ/ѝ) so that accented words can be found in the dictionary
      word = word.replace("о́", "о");
      word = word.replace("а́", "а");
      word = word.replace("е́", "е");
      word = word.replace("у́", "у");
      word = word.replace("и́", "и");
      word = word.replace("ы́", "ы");
      word = word.replace("э́", "э");
      word = word.replace("ю́", "ю");
      word = word.replace("я́", "я");
      word = word.replace("о̀", "о");
      word = word.replace("а̀", "а");
      word = word.replace("ѐ", "е");
      word = word.replace("у̀", "у");
      word = word.replace("ѝ", "и");
      word = word.replace("ы̀", "ы");
      word = word.replace("э̀", "э");
      word = word.replace("ю̀", "ю");
      word = word.replace("я̀", "я");
      // modifier apostrophe is sometimes used in place of the hard sign
      word = word.replace("ʼ", "ъ");
    }
    List<AnalyzedToken> l = getAnalyzedTokens(word);
    tokenReadings.add(new AnalyzedTokenReadings(l, pos));
    pos += word.length();
  }
  return tokenReadings;
}
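Most of the accented string literals above are actually two code points, a vowel followed by a combining mark (U+0301 acute, U+0300 grave), which is why plain replace works on them. A sketch of an alternative, more general way to strip such stress marks with Unicode normalization (an assumption for illustration, not what RussianTagger actually uses):

import java.text.Normalizer;

public class StressMarkSketch {
  public static void main(String[] args) {
    String word = "за\u0301мок"; // "за́мок" with a combining acute accent on the "а"
    // NFD keeps (or makes) the accent a separate combining mark; then drop all combining marks
    String bare = Normalizer.normalize(word, Normalizer.Form.NFD)
        .replaceAll("\\p{Mn}+", "");
    System.out.println(bare); // "замок"
  }
}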
Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class RuleFilterEvaluatorTest, method testGetResolvedArguments.
@Test
public void testGetResolvedArguments() throws Exception {
  AnalyzedTokenReadings[] readingsList = {
    new AnalyzedTokenReadings(new AnalyzedToken("fake1", "pos", null), 0),
    new AnalyzedTokenReadings(new AnalyzedToken("fake2", "pos", null), 0)
  };
  Map<String, String> map = eval.getResolvedArguments("year:\\1 month:\\2", readingsList, Arrays.asList(1, 1));
  assertThat(map.get("year"), is("fake1"));
  assertThat(map.get("month"), is("fake2"));
  assertThat(map.size(), is(2));
}