Search in sources :

Example 66 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class SentenceSourceChecker method run.

/**
 * Checks the sentences from the given files with LanguageTool and hands each
 * sentence together with its rule matches to a result handler.
 *
 * @param propFile if not {@code null}, results go to a {@code DatabaseHandler} created from
 *                 this file; otherwise a {@code StdoutHandler} is used
 * @param disabledRules passed to {@code applyRuleDeactivation}; only used when {@code ruleIds} is {@code null}
 * @param langCode short code used to look up the language to check
 * @param fileNames the input files the sentences are read from
 * @param ruleIds if not {@code null}, passed to {@code enableOnlySpecifiedRules}
 * @param additionalCategoryIds passed to {@code activateAdditionalCategories}
 * @param maxSentences sentence limit handed to the result handler; values {@code <= 0} are reported as "no limit"
 * @param maxErrors error limit handed to the result handler; values {@code <= 0} are reported as "no limit"
 * @param languageModelDir if not {@code null}, language model rules are activated with this directory
 * @param filter if not {@code null}, handed to the sentence source so that only matching sentences are checked
 * @throws IOException declared for the I/O performed by the called components
 */
private void run(File propFile, Set<String> disabledRules, String langCode, List<String> fileNames, String[] ruleIds, String[] additionalCategoryIds, int maxSentences, int maxErrors, File languageModelDir, Pattern filter) throws IOException {
    Language lang = Languages.getLanguageForShortCode(langCode);
    MultiThreadedJLanguageTool languageTool = new MultiThreadedJLanguageTool(lang);
    languageTool.setCleanOverlappingMatches(false);
    if (languageModelDir != null) {
        languageTool.activateLanguageModelRules(languageModelDir);
    }
    // An explicit rule list takes precedence over the set of disabled rules.
    if (ruleIds != null) {
        enableOnlySpecifiedRules(ruleIds, languageTool);
    } else {
        applyRuleDeactivation(languageTool, disabledRules);
    }
    if (filter != null) {
        System.out.println("*** NOTE: only sentences that match regular expression '" + filter + "' will be checked");
    }
    activateAdditionalCategories(additionalCategoryIds, languageTool);
    disableSpellingRules(languageTool);
    System.out.println("Working on: " + StringUtils.join(fileNames, ", "));
    System.out.println("Sentence limit: " + (maxSentences > 0 ? maxSentences : "no limit"));
    System.out.println("Error limit: " + (maxErrors > 0 ? maxErrors : "no limit"));
    ResultHandler resultHandler = null;
    int ruleMatchCount = 0;
    int sentenceCount = 0;
    try {
        if (propFile != null) {
            resultHandler = new DatabaseHandler(propFile, maxSentences, maxErrors);
        } else {
            resultHandler = new StdoutHandler(maxSentences, maxErrors);
        }
        MixingSentenceSource mixingSource = MixingSentenceSource.create(fileNames, lang, filter);
        while (mixingSource.hasNext()) {
            Sentence sentence = mixingSource.next();
            try {
                List<RuleMatch> matches = languageTool.check(sentence.getText());
                resultHandler.handleResult(sentence, matches, lang);
                sentenceCount++;
                if (sentenceCount % 5000 == 0) {
                    System.err.printf("%s sentences checked...\n", NumberFormat.getNumberInstance(Locale.US).format(sentenceCount));
                }
                ruleMatchCount += matches.size();
            } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
                // Limit exceptions end the run regularly; pass them to the outer handler unwrapped.
                throw e;
            } catch (Exception e) {
                throw new RuntimeException("Check failed on sentence: " + StringUtils.abbreviate(sentence.getText(), 250), e);
            }
        }
    } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
        System.out.println(getClass().getSimpleName() + ": " + e);
    } finally {
        languageTool.shutdown();
        if (resultHandler != null) {
            // Guard against 0/0, which would otherwise be printed as "NaN" when no sentence was checked.
            float matchesPerSentence = sentenceCount > 0 ? (float) ruleMatchCount / sentenceCount : 0f;
            // Pass the language as an argument so a '%' in its text cannot corrupt the format string.
            System.out.printf("%s: %d total matches\n", lang, ruleMatchCount);
            System.out.printf("%s: ø%.2f rule matches per sentence\n", lang, matchesPerSentence);
            try {
                resultHandler.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
Also used : MultiThreadedJLanguageTool(org.languagetool.MultiThreadedJLanguageTool) IOException(java.io.IOException) RuleMatch(org.languagetool.rules.RuleMatch) Language(org.languagetool.Language)

Example 67 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class Searcher method main.

/**
 * Command line entry point: runs the given rules against an index and prints the matches.
 *
 * @param args {@code args[0]}: comma-separated rule ids, {@code args[1]}: language short code,
 *             {@code args[2]}: index directory, optional {@code args[3]}: {@code --no_limit}
 *             to switch off the search limit
 * @throws Exception if setting up or running the search fails
 */
public static void main(String[] args) throws Exception {
    ensureCorrectUsageOrExit(args);
    long startTime = System.currentTimeMillis();
    String[] ruleIds = args[0].split(",");
    String languageCode = args[1];
    Language language = Languages.getLanguageForShortCode(languageCode);
    File indexDir = new File(args[2]);
    boolean limitSearch = !(args.length > 3 && "--no_limit".equals(args[3]));
    Searcher searcher = new Searcher(new SimpleFSDirectory(indexDir.toPath()));
    if (!limitSearch) {
        // Without the search limit, allow a much larger number of hits.
        searcher.setMaxHits(100_000);
    }
    searcher.limitSearch = limitSearch;
    ContextTools contextTools = getContextTools(140);
    int totalMatches = 0;
    for (String ruleId : ruleIds) {
        long ruleStartTime = System.currentTimeMillis();
        for (PatternRule rule : searcher.getRuleById(ruleId, language)) {
            System.out.println("===== " + rule.getFullId() + " =========================================================");
            SearcherResult searcherResult = searcher.findRuleMatchesOnIndex(rule, language);
            // Counts matching sentences, not individual rule matches.
            int i = 1;
            if (searcherResult.getMatchingSentences().size() == 0) {
                System.out.println("[no matches]");
            }
            for (MatchingSentence ruleMatch : searcherResult.getMatchingSentences()) {
                for (RuleMatch match : ruleMatch.getRuleMatches()) {
                    String context = contextTools.getContext(match.getFromPos(), match.getToPos(), ruleMatch.getSentence());
                    if (WIKITEXT_OUTPUT) {
                        ContextTools contextTools2 = getContextTools(0);
                        String coveredText = contextTools2.getContext(match.getFromPos(), match.getToPos(), ruleMatch.getSentence());
                        // Strip the leading/trailing "..." and "**" markers from the covered text.
                        coveredText = coveredText.replaceFirst("^\\.\\.\\.", "").replaceFirst("\\.\\.\\.$", "");
                        coveredText = coveredText.replaceFirst("^\\*\\*", "").replaceFirst("\\*\\*$", "");
                        String encodedTextWithQuotes = URLEncoder.encode("\"" + coveredText + "\"", "UTF-8");
                        String searchLink = "https://de.wikipedia.org/w/index.php?search=" + encodedTextWithQuotes + "&title=Spezial%3ASuche&go=Artikel";
                        // The replacement holds raw sentence text: quote it so that '$' and '\' are
                        // inserted literally instead of being read as regex group references.
                        String wikiLink = "[" + searchLink + " " + coveredText + "]";
                        context = context.replaceAll("\\*\\*.*?\\*\\*", java.util.regex.Matcher.quoteReplacement(wikiLink));
                        String encTitle = URLEncoder.encode(ruleMatch.getTitle(), "UTF-8");
                        String encodedText = URLEncoder.encode(coveredText, "UTF-8");
                        System.out.println("# [[" + ruleMatch.getTitle() + "]]: " + context + " ([http://wikipedia.ramselehof.de/wikiblame.php?user_lang=de&lang=de&project=wikipedia&article=" + encTitle + "&needle=" + encodedText + "&skipversions=0&ignorefirst=0&limit=500&searchmethod=int&order=desc&start=Start WikiBlame])");
                    } else {
                        System.out.println(i + ": " + context + " [" + ruleMatch.getSource() + "]");
                    }
                }
                totalMatches += ruleMatch.getRuleMatches().size();
                i++;
            }
            System.out.println("Time: " + (System.currentTimeMillis() - ruleStartTime) + "ms");
        }
    }
    System.out.println("Total time: " + (System.currentTimeMillis() - startTime) + "ms, " + totalMatches + " matches");
}
Also used : PatternRule(org.languagetool.rules.patterns.PatternRule) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) ContextTools(org.languagetool.tools.ContextTools) RuleMatch(org.languagetool.rules.RuleMatch) Language(org.languagetool.Language) File(java.io.File)

Example 68 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class ToolsTest method testBitextCheck.

/**
 * Runs the English/Polish bitext rules over a few sentence pairs, using the given cache
 * for both tools, and verifies the number, id and position of the reported matches.
 */
private void testBitextCheck(ResultCache cache) throws IOException, ParserConfigurationException, SAXException {
    Language sourceLang = Languages.getLanguageForShortCode("en");
    JLanguageTool sourceTool = new JLanguageTool(sourceLang, null, cache);
    Language targetLang = Languages.getLanguageForShortCode("pl");
    JLanguageTool targetTool = new JLanguageTool(targetLang, null, cache);
    List<BitextRule> bitextRules = Tools.getBitextRules(sourceLang, targetLang);

    // A correct pair must not produce any match.
    List<RuleMatch> cleanResult = Tools.checkBitext("This is a perfectly good sentence.", "To jest całkowicie prawidłowe zdanie.", sourceTool, targetTool, bitextRules);
    int matchCount = cleanResult.size();
    assertEquals(0, matchCount);

    // One sentence containing the problematic word.
    List<RuleMatch> oneSentence = Tools.checkBitext("This is not actual.", "To nie jest aktualne.", sourceTool, targetTool, bitextRules);
    assertEquals(1, oneSentence.size());
    RuleMatch firstHit = oneSentence.get(0);
    assertThat(firstHit.getRule().getId(), is("ACTUAL"));
    assertThat(firstHit.getFromPos(), is(12));
    assertThat(firstHit.getToPos(), is(20));

    // A preceding sentence shifts the reported positions.
    List<RuleMatch> twoSentences = Tools.checkBitext("A sentence. This is not actual.", "Zdanie. To nie jest aktualne.", sourceTool, targetTool, bitextRules);
    assertEquals(1, twoSentences.size());
    RuleMatch secondHit = twoSentences.get(0);
    assertThat(secondHit.getRule().getId(), is("ACTUAL"));
    assertThat(secondHit.getFromPos(), is(20));
    assertThat(secondHit.getToPos(), is(28));

    // Same check with a longer preceding sentence.
    List<RuleMatch> longerPrefix = Tools.checkBitext("A new sentence. This is not actual.", "Nowa zdanie. To nie jest aktualne.", sourceTool, targetTool, bitextRules);
    assertEquals(1, longerPrefix.size());
    RuleMatch thirdHit = longerPrefix.get(0);
    assertThat(thirdHit.getRule().getId(), is("ACTUAL"));
    assertThat(thirdHit.getFromPos(), is(25));
    assertThat(thirdHit.getToPos(), is(33));
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) Language(org.languagetool.Language) JLanguageTool(org.languagetool.JLanguageTool) BitextRule(org.languagetool.rules.bitext.BitextRule)

Example 69 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class WikipediaQuickCheck method getMediaWikiContent.

/**
 * Builds the MediaWiki API query URL for the page behind the given Wikipedia URL
 * and returns what {@code getContent} delivers for it.
 *
 * @param wikipediaUrl URL of a Wikipedia page; language and page title are derived from it
 * @return the result of {@code getContent} for the API URL
 * @throws IOException declared for the URL handling and content retrieval
 */
public String getMediaWikiContent(URL wikipediaUrl) throws IOException {
    Language wikiLanguage = getLanguage(wikipediaUrl);
    String title = getPageTitle(wikipediaUrl);
    // Host is the language-specific Wikipedia; the title is passed URL-encoded.
    StringBuilder api = new StringBuilder("https://");
    api.append(wikiLanguage.getShortCode());
    api.append(".wikipedia.org/w/api.php?titles=");
    api.append(URLEncoder.encode(title, "utf-8"));
    api.append("&action=query&prop=revisions&rvprop=content|timestamp&format=xml");
    URL apiLocation = new URL(api.toString());
    return getContent(apiLocation);
}
Also used : Language(org.languagetool.Language) URL(java.net.URL)

Example 70 with Language

use of org.languagetool.Language in project languagetool by languagetool-org.

the class POSDictionaryBuilderTest method testExportAndImport.

/**
 * For every known language: exports the existing binary dictionary, rebuilds a binary
 * dictionary from that export and prints the sizes of the old and the new file.
 * Languages whose export is empty are skipped.
 */
@Test
@Ignore("for interactive use only")
public void testExportAndImport() throws Exception {
    for (Language language : Languages.get()) {
        String langCode = language.getShortCode();
        File dir = new File("./languagetool-language-modules/" + langCode + "/src/main/resources/org/languagetool/resource/" + langCode);
        // Lower-case with a fixed locale: with the default locale (e.g. Turkish) an 'I' in the
        // language name would become a dotless 'ı' and the file names would not be found.
        String baseName = language.getName().toLowerCase(java.util.Locale.ROOT);
        File oldBinaryFile = new File(dir, baseName + ".dict");
        File infoFile = new File(dir, baseName + ".info");
        File exportFile = exportDictionaryContents(oldBinaryFile);
        if (exportFile.length() == 0) {
            System.out.println("Zero-size output for " + language + ", skipping dictionary generation");
            exportFile.delete();
            continue;
        }
        POSDictionaryBuilder builder = new POSDictionaryBuilder(infoFile);
        File newBinaryFile = builder.build(exportFile);
        // The export is only an intermediate file and is removed once the new dictionary is built.
        exportFile.delete();
        System.out.println(language + " old binary file size: " + oldBinaryFile.length() + " bytes (" + oldBinaryFile.getName() + ")");
        System.out.println(language + " new binary file size: " + newBinaryFile.length() + " bytes (" + newBinaryFile.getAbsolutePath() + ")");
        // comment in to copy the new files over the old ones:
        /*boolean b = newBinaryFile.renameTo(oldBinaryFile);
      if (!b) {
        throw new RuntimeException("Could not rename " + newBinaryFile.getAbsolutePath() + " to " + oldBinaryFile.getCanonicalPath());
      }*/
        System.out.println("");
    }
}
Also used : Language(org.languagetool.Language) File(java.io.File) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

Language (org.languagetool.Language): 84, Test (org.junit.Test): 23, File (java.io.File): 15, ArrayList (java.util.ArrayList): 12, JLanguageTool (org.languagetool.JLanguageTool): 11, Rule (org.languagetool.rules.Rule): 11, RuleMatch (org.languagetool.rules.RuleMatch): 10, IOException (java.io.IOException): 7, Ignore (org.junit.Ignore): 6, StringTools.readerToString (org.languagetool.tools.StringTools.readerToString): 5, InputStream (java.io.InputStream): 4, English (org.languagetool.language.English): 4, BitextRule (org.languagetool.rules.bitext.BitextRule): 4, URL (java.net.URL): 3, HashSet (java.util.HashSet): 3, MultiThreadedJLanguageTool (org.languagetool.MultiThreadedJLanguageTool): 3, AmericanEnglish (org.languagetool.language.AmericanEnglish): 3, LanguageModel (org.languagetool.languagemodel.LanguageModel): 3, LuceneLanguageModel (org.languagetool.languagemodel.LuceneLanguageModel): 3, BufferedReader (java.io.BufferedReader): 2