Search in sources:

Example 1 with PosTagSet

use of com.joliciel.talismane.posTagger.PosTagSet in project talismane by joliciel-informatique.

The method checkInternal of the class PosTagSetFeature.

/**
 * Lists the code of every pos-tag in the current session's pos-tag set as an
 * outcome, each carrying a weight of 1.0.
 *
 * @param context the token wrapper being analysed (not read by this implementation)
 * @param env the runtime environment (not read by this implementation)
 * @return whatever {@code generateResult} produces for the list of weighted pos-tag codes
 */
@Override
public FeatureResult<List<WeightedOutcome<String>>> checkInternal(TokenWrapper context, RuntimeEnvironment env) {
    List<WeightedOutcome<String>> weightedCodes = new ArrayList<WeightedOutcome<String>>();
    // One outcome per tag of the session's tag set, all with the same weight.
    for (PosTag tag : TalismaneSession.get(sessionId).getPosTagSet().getTags()) {
        WeightedOutcome<String> outcome = new WeightedOutcome<String>(tag.getCode(), 1.0);
        weightedCodes.add(outcome);
    }
    return this.generateResult(weightedCodes);
}
Also used : PosTagSet(com.joliciel.talismane.posTagger.PosTagSet) PosTag(com.joliciel.talismane.posTagger.PosTag) ArrayList(java.util.ArrayList) WeightedOutcome(com.joliciel.talismane.utils.WeightedOutcome)

Example 2 with PosTagSet

use of com.joliciel.talismane.posTagger.PosTagSet in project talismane by joliciel-informatique.

The method onCompleteAnalysis of the class PosTaggerStatisticsWriter.

/**
 * Emits the statistics gathered in {@code stats} once the analysis is complete.
 * <p>
 * If {@code writer} is set, a sequence of CSV rows (label, value) is written to it,
 * followed by one row per pos-tag code (code, count, percentage of tokens); the writer
 * is then flushed and closed. If {@code serializationFile} is set, {@code stats} is
 * additionally serialized into a zip file under the entry name {@code Contents.obj}.
 * <p>
 * Both the writer and the zip stream are closed even when writing fails part-way,
 * so no file handle is leaked on error.
 * <p>
 * Percentages are computed with floating-point division, so a zero denominator
 * (e.g. no tokens) yields {@code NaN} rather than an exception.
 *
 * @throws IOException if writing the CSV or the serialized statistics fails
 */
@Override
public void onCompleteAnalysis() throws IOException {
    if (writer != null) {
        try {
            // Make sure every tag of the tag set gets a row, even when it was never counted.
            PosTagSet posTagSet = TalismaneSession.get(sessionId).getPosTagSet();
            for (PosTag posTag : posTagSet.getTags()) {
                if (!stats.posTagCounts.containsKey(posTag.getCode())) {
                    stats.posTagCounts.put(posTag.getCode(), 0);
                }
            }
            // Without reference statistics every word counts as unknown (ratio of 1).
            double unknownLexiconPercent = 1;
            if (referenceStats != null) {
                int unknownLexiconCount = 0;
                for (String word : stats.words) {
                    if (!referenceStats.words.contains(word))
                        unknownLexiconCount++;
                }
                unknownLexiconPercent = (double) unknownLexiconCount / (double) stats.words.size();
            }
            // Same ratio, computed on the lower-cased lexicon.
            double unknownLowercaseLexiconPercent = 1;
            if (referenceStats != null) {
                int unknownLowercaseLexiconCount = 0;
                for (String lowercase : stats.lowerCaseWords) {
                    if (!referenceStats.lowerCaseWords.contains(lowercase))
                        unknownLowercaseLexiconCount++;
                }
                unknownLowercaseLexiconPercent = (double) unknownLowercaseLexiconCount / (double) stats.lowerCaseWords.size();
            }
            writer.write(CSV.format("sentenceCount") + CSV.format(stats.sentenceCount) + "\n");
            writer.write(CSV.format("sentenceLengthMean") + CSV.format(stats.sentenceLengthStats.getMean()) + "\n");
            writer.write(CSV.format("sentenceLengthStdDev") + CSV.format(stats.sentenceLengthStats.getStandardDeviation()) + "\n");
            writer.write(CSV.format("lexiconSize") + CSV.format(stats.words.size()) + "\n");
            writer.write(CSV.format("lexiconUnknownInRefCorpus") + CSV.format(unknownLexiconPercent * 100.0) + "\n");
            writer.write(CSV.format("tokenCount") + CSV.format(stats.tokenCount) + "\n");
            double unknownTokenPercent = ((double) stats.unknownTokenCount / (double) stats.tokenCount) * 100.0;
            writer.write(CSV.format("tokenUnknownInRefCorpus") + CSV.format(unknownTokenPercent) + "\n");
            double unknownInLexiconPercent = ((double) stats.unknownInLexiconCount / (double) stats.tokenCount) * 100.0;
            writer.write(CSV.format("tokenUnknownInRefLexicon") + CSV.format(unknownInLexiconPercent) + "\n");
            writer.write(CSV.format("lowercaseLexiconSize") + CSV.format(stats.lowerCaseWords.size()) + "\n");
            writer.write(CSV.format("lowercaseLexiconUnknownInRefCorpus") + CSV.format(unknownLowercaseLexiconPercent * 100.0) + "\n");
            writer.write(CSV.format("alphanumericCount") + CSV.format(stats.alphanumericCount) + "\n");
            double unknownAlphanumericPercent = ((double) stats.unknownAlphanumericCount / (double) stats.alphanumericCount) * 100.0;
            writer.write(CSV.format("alphaUnknownInRefCorpus") + CSV.format(unknownAlphanumericPercent) + "\n");
            double unknownAlphaInLexiconPercent = ((double) stats.unknownAlphaInLexiconCount / (double) stats.alphanumericCount) * 100.0;
            writer.write(CSV.format("alphaUnknownInRefLexicon") + CSV.format(unknownAlphaInLexiconPercent) + "\n");
            writer.write(CSV.format("openClassCount") + CSV.format(stats.openClassCount) + "\n");
            double openClassUnknownPercent = ((double) stats.openClassUnknownInRefCorpus / (double) stats.openClassCount) * 100.0;
            writer.write(CSV.format("openClassUnknownInRefCorpus") + CSV.format(openClassUnknownPercent) + "\n");
            double openClassUnknownInLexiconPercent = ((double) stats.openClassUnknownInLexicon / (double) stats.openClassCount) * 100.0;
            writer.write(CSV.format("openClassUnknownInRefLexicon") + CSV.format(openClassUnknownInLexiconPercent) + "\n");
            writer.write(CSV.format("closedClassCount") + CSV.format(stats.closedClassCount) + "\n");
            double closedClassUnknownPercent = ((double) stats.closedClassUnknownInRefCorpus / (double) stats.closedClassCount) * 100.0;
            writer.write(CSV.format("closedClassUnknownInRefCorpus") + CSV.format(closedClassUnknownPercent) + "\n");
            double closedClassUnknownInLexiconPercent = ((double) stats.closedClassUnknownInLexicon / (double) stats.closedClassCount) * 100.0;
            writer.write(CSV.format("closedClassUnknownInRefLexicon") + CSV.format(closedClassUnknownInLexiconPercent) + "\n");
            // One row per pos-tag code: code, raw count, share of all tokens in percent.
            for (String posTag : stats.posTagCounts.keySet()) {
                int count = stats.posTagCounts.get(posTag);
                writer.write(CSV.format(posTag) + CSV.format(count) + CSV.format(((double) count / (double) stats.tokenCount) * 100.0) + "\n");
            }
            writer.flush();
        } finally {
            // Release the writer even if one of the writes above failed.
            writer.close();
        }
    }
    if (this.serializationFile != null) {
        // try-with-resources closes the zip stream (and the underlying file stream)
        // even if adding the entry or serializing the statistics throws.
        try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(serializationFile, false))) {
            zos.putNextEntry(new ZipEntry("Contents.obj"));
            ObjectOutputStream oos = new ObjectOutputStream(zos);
            oos.writeObject(stats);
            // Push buffered object data into the zip stream before it is finished and closed.
            oos.flush();
        }
    }
}
Also used : PosTagSet(com.joliciel.talismane.posTagger.PosTagSet) PosTag(com.joliciel.talismane.posTagger.PosTag) ZipOutputStream(java.util.zip.ZipOutputStream) FileOutputStream(java.io.FileOutputStream) ZipEntry(java.util.zip.ZipEntry) ObjectOutputStream(java.io.ObjectOutputStream)

Example 3 with PosTagSet

use of com.joliciel.talismane.posTagger.PosTagSet in project talismane by joliciel-informatique.

The method main of the class SpmrlConverter.

/**
 * Command-line entry point: reads one or more CoNLL-style input files, regroups the
 * tokens of each sentence into clusters based on the {@code dep_cpd} label, optionally
 * splits or compresses multi-token clusters, renumbers the tokens and writes the
 * result to an output file per input file.
 * <p>
 * Recognised arguments (as keys of the map returned by {@code StringUtils.convertArgs}):
 * <ul>
 * <li>{@code logConfigFile}: path handed to {@code LogUtils.configureLogging}</li>
 * <li>{@code inFile}: a single input file (used when {@code inDir} is absent)</li>
 * <li>{@code inDir}: a directory; every file whose name ends with {@code inSuffix} is converted</li>
 * <li>{@code outDir}: output directory (defaults to the directory of each input file)</li>
 * <li>{@code inSuffix}: input file suffix, default {@code .conll}; a leading dot is added if missing</li>
 * <li>{@code suffix}: output file suffix, default {@code tal}</li>
 * <li>{@code compressCompounds}: default true; when false every multi-token cluster is split</li>
 * <li>{@code convertCompounds}: default false; when true, clusters matching a known pos-tag pattern
 * are split and given explicit internal dependencies</li>
 * <li>{@code posTagSet}: path to a pos-tag set file, read as UTF-8</li>
 * </ul>
 * Errors while converting one file are logged and the next file is processed.
 *
 * @param args the command-line arguments
 * @throws Exception if argument parsing or pos-tag set loading fails; a {@link RuntimeException}
 *         is thrown for an unknown option or an unreadable input directory
 */
public static void main(String[] args) throws Exception {
    Map<String, String> argMap = StringUtils.convertArgs(args);
    String logConfigPath = argMap.get("logConfigFile");
    argMap.remove("logConfigFile");
    if (logConfigPath != null)
        LogUtils.configureLogging(logConfigPath);
    String spmrlPath = "";
    String suffix = "tal";
    boolean compressCompounds = true;
    boolean convertCompounds = false;
    String inDirPath = null;
    String outDirPath = null;
    String inSuffix = ".conll";
    String posTagSetPath = null;
    for (String argName : argMap.keySet()) {
        String argValue = argMap.get(argName);
        if (argName.equals("inFile")) {
            spmrlPath = argValue;
        } else if (argName.equals("inDir")) {
            inDirPath = argValue;
        } else if (argName.equals("outDir")) {
            outDirPath = argValue;
        } else if (argName.equals("inSuffix")) {
            inSuffix = argValue;
        } else if (argName.equals("suffix")) {
            suffix = argValue;
        } else if (argName.equals("compressCompounds")) {
            compressCompounds = argValue.equalsIgnoreCase("true");
        } else if (argName.equals("convertCompounds")) {
            convertCompounds = argValue.equalsIgnoreCase("true");
        } else if (argName.equals("posTagSet")) {
            posTagSetPath = argValue;
        } else {
            throw new RuntimeException("Unknown option: " + argName);
        }
    }
    if (!inSuffix.startsWith("."))
        inSuffix = "." + inSuffix;
    // The anonymous filter below can only capture a final variable.
    final String inSuffixFinal = inSuffix;
    List<File> inFiles = new ArrayList<File>();
    if (inDirPath != null) {
        File inDir = new File(inDirPath);
        File[] inFileArray = inDir.listFiles(new FilenameFilter() {

            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(inSuffixFinal);
            }
        });
        // listFiles returns null (not an empty array) when the path is not a directory
        // or cannot be read; fail with a clear message instead of a NullPointerException.
        if (inFileArray == null)
            throw new RuntimeException("Could not list files in directory: " + inDirPath);
        for (File file : inFileArray) inFiles.add(file);
    } else {
        File spmrlFile = new File(spmrlPath);
        inFiles.add(spmrlFile);
    }
    if (posTagSetPath != null) {
        File posTagSetFile = new File(posTagSetPath);
        try (Scanner posTagSetScanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(posTagSetFile), "UTF-8")))) {
            // NOTE(review): posTagSet is not declared in this method - presumably a static field of the class.
            posTagSet = new PosTagSet(posTagSetScanner);
        }
    }
    for (File inFile : inFiles) {
        // Declared outside the try so that the finally block can release them on failure.
        Writer writer = null;
        Scanner scanner = null;
        try {
            File outDir = inFile.getParentFile();
            if (outDirPath != null) {
                outDir = new File(outDirPath);
                outDir.mkdirs();
            }
            // Output name: input name with the input suffix replaced by "." + suffix.
            String fileName = inFile.getName().substring(0, inFile.getName().length() - inSuffix.length()) + "." + suffix;
            writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outDir, fileName)), "UTF-8"));
            scanner = new Scanner(new BufferedReader(new InputStreamReader(new FileInputStream(inFile), "UTF-8")));
            List<ConllLine> lines = new ArrayList<SpmrlConverter.ConllLine>();
            int lineNumber = 0;
            int newLineNumber = 0;
            Map<String, Integer> compoundPatternCounts = new TreeMap<String, Integer>();
            int nonProjectiveCount = 0;
            // Always false here: non-projective arcs are logged and counted, never fatal.
            boolean errorOnNonProjective = false;
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                lineNumber++;
                LOG.trace(lineNumber + ": " + line);
                if (line.trim().length() == 0) {
                    // Blank line: convert and write out the lines accumulated since the previous blank line.
                    // NOTE(review): lines after the last blank line of the file are never written - confirm inputs always end with one.

                    // Step 1: group the lines into token clusters, driven by the "dep_cpd" label.
                    List<TokenCluster> tokens = new ArrayList<TokenCluster>();
                    boolean inCluster = false;
                    ConllLine lastLine = null;
                    for (ConllLine conllLine : lines) {
                        TokenCluster tokenCluster = new TokenCluster();
                        if (!conllLine.label.equals("dep_cpd")) {
                            if (inCluster) {
                                // Closes a forward-looking cluster: this line joins it and ends it.
                                tokenCluster = tokens.get(tokens.size() - 1);
                                inCluster = false;
                            } else {
                                tokens.add(tokenCluster);
                            }
                        } else {
                            if (conllLine.index < conllLine.governor) {
                                // forward looking cluster
                                if (lastLine != null && lastLine.compPosTag != null) {
                                    tokenCluster = tokens.get(tokens.size() - 1);
                                } else if (inCluster) {
                                    tokenCluster = tokens.get(tokens.size() - 1);
                                } else {
                                    inCluster = true;
                                    tokens.add(tokenCluster);
                                }
                            } else if (tokens.size() > 0) {
                                // Governor is not ahead of this line: attach to the most recent cluster.
                                tokenCluster = tokens.get(tokens.size() - 1);
                            } else {
                                tokens.add(tokenCluster);
                            }
                        }
                        tokenCluster.add(conllLine);
                        lastLine = conllLine;
                    }
                    // Step 2: for each multi-token cluster, either split it into single-token clusters
                    // or compress it into one token carried by its first line.
                    List<TokenCluster> newTokens = new ArrayList<TokenCluster>();
                    for (TokenCluster tokenCluster : tokens) {
                        if (tokenCluster.size() > 1) {
                            boolean split = false;
                            // Pattern key such as "NC|ADJ|" built from the posTag2 of each line, plus the surface words.
                            String posTags = "";
                            String word = "";
                            for (ConllLine conllLine : tokenCluster) {
                                posTags += conllLine.posTag2 + "|";
                                word += conllLine.word + " ";
                            }
                            Integer countObj = compoundPatternCounts.get(posTags);
                            int count = countObj == null ? 0 : countObj.intValue();
                            count++;
                            compoundPatternCounts.put(posTags, count);
                            if (convertCompounds) {
                                // Known patterns get explicit internal governors and labels;
                                // unknown patterns fall through to the final else and stay unsplit.
                                split = true;
                                if (posTags.equals("NC|ADJ|")) {
                                    tokenCluster.head = 0;
                                    tokenCluster.get(1).governor = tokenCluster.get(0).index;
                                    tokenCluster.get(1).label = "mod";
                                    tokenCluster.get(1).copyGovernor();
                                } else if (posTags.equals("NC|NC|")) {
                                    tokenCluster.head = 0;
                                    tokenCluster.get(1).governor = tokenCluster.get(0).index;
                                    tokenCluster.get(1).label = "mod";
                                    tokenCluster.get(1).copyGovernor();
                                } else if (posTags.equals("NC|ADJ|ADJ|")) {
                                    tokenCluster.head = 0;
                                    tokenCluster.get(1).governor = tokenCluster.get(0).index;
                                    tokenCluster.get(1).label = "mod";
                                    tokenCluster.get(1).copyGovernor();
                                    tokenCluster.get(2).governor = tokenCluster.get(0).index;
                                    tokenCluster.get(2).label = "mod";
                                    tokenCluster.get(2).copyGovernor();
                                } else if (posTags.equals("NC|P|NC|") || posTags.equals("NC|P+D|NC|") || posTags.equals("NC|P|NPP|") || posTags.equals("NC|P+D|NPP|")) {
                                    tokenCluster.head = 0;
                                    tokenCluster.get(1).governor = tokenCluster.get(0).index;
                                    tokenCluster.get(1).label = "dep";
                                    tokenCluster.get(1).copyGovernor();
                                    tokenCluster.get(2).governor = tokenCluster.get(1).index;
                                    tokenCluster.get(2).label = "prep";
                                    tokenCluster.get(2).copyGovernor();
                                } else if (posTags.equals("NC|P|DET|NC|")) {
                                    tokenCluster.head = 0;
                                    tokenCluster.get(1).governor = tokenCluster.get(0).index;
                                    tokenCluster.get(1).label = "dep";
                                    tokenCluster.get(1).copyGovernor();
                                    tokenCluster.get(2).governor = tokenCluster.get(3).index;
                                    tokenCluster.get(2).label = "det";
                                    tokenCluster.get(2).copyGovernor();
                                    tokenCluster.get(3).governor = tokenCluster.get(1).index;
                                    tokenCluster.get(3).label = "prep";
                                    tokenCluster.get(3).copyGovernor();
                                } else if (posTags.equals("NC|P|NC|ADJ|") || posTags.equals("NC|P+D|NC|ADJ|")) {
                                    tokenCluster.head = 0;
                                    tokenCluster.get(1).governor = tokenCluster.get(0).index;
                                    tokenCluster.get(1).label = "dep";
                                    tokenCluster.get(1).copyGovernor();
                                    tokenCluster.get(2).governor = tokenCluster.get(1).index;
                                    tokenCluster.get(2).label = "prep";
                                    tokenCluster.get(2).copyGovernor();
                                    tokenCluster.get(3).governor = tokenCluster.get(2).index;
                                    tokenCluster.get(3).label = "mod";
                                    tokenCluster.get(3).copyGovernor();
                                } else if (posTags.equals("NC|ADJ|P|NC|") || posTags.equals("NC|ADJ|P+D|NC|")) {
                                    tokenCluster.head = 0;
                                    tokenCluster.get(1).governor = tokenCluster.get(0).index;
                                    tokenCluster.get(1).label = "mod";
                                    tokenCluster.get(1).copyGovernor();
                                    tokenCluster.get(2).governor = tokenCluster.get(0).index;
                                    tokenCluster.get(2).label = "dep";
                                    tokenCluster.get(2).copyGovernor();
                                    tokenCluster.get(3).governor = tokenCluster.get(2).index;
                                    tokenCluster.get(3).label = "prep";
                                    tokenCluster.get(3).copyGovernor();
                                } else if (posTags.equals("ADJ|NC|")) {
                                    // The only pattern whose head is the second token.
                                    tokenCluster.head = 1;
                                    if (tokenCluster.get(1).governor == tokenCluster.get(0).index) {
                                        // The noun currently depends on the adjective: let it inherit
                                        // the adjective's attachment before the adjective is re-attached below.
                                        tokenCluster.get(1).governor = tokenCluster.get(0).governor;
                                        tokenCluster.get(1).label = tokenCluster.get(0).label;
                                        tokenCluster.get(1).projGov = tokenCluster.get(0).projGov;
                                        tokenCluster.get(1).projLabel = tokenCluster.get(0).projLabel;
                                    }
                                    tokenCluster.get(0).governor = tokenCluster.get(1).index;
                                    tokenCluster.get(0).label = "mod";
                                    tokenCluster.get(0).copyGovernor();
                                } else {
                                    if (posTags.equals("DET|PONCT|DET|") || posTags.equals("DET|DET|")) {
                                        // do nothing
                                    } else {
                                        LOG.debug(posTags + ": " + word);
                                    }
                                    split = false;
                                }
                                if (split) {
                                    for (ConllLine conllLine : tokenCluster) {
                                        conllLine.removeMweHead();
                                    }
                                }
                            }
                            if (!compressCompounds)
                                split = true;
                            if (split) {
                                if (tokenCluster.head != 0) {
                                    // The head is not the first token: redirect every line of the sentence
                                    // that pointed at the first token so that it points at the head instead.
                                    int oldIndex = tokenCluster.get(0).index;
                                    int newIndex = tokenCluster.get(tokenCluster.head).index;
                                    for (ConllLine conllLine : lines) {
                                        if (conllLine.governor == oldIndex) {
                                            conllLine.governor = newIndex;
                                        }
                                        if (conllLine.projGov == oldIndex) {
                                            conllLine.projGov = newIndex;
                                        }
                                    }
                                }
                                // Every line of the cluster becomes its own single-token cluster.
                                for (ConllLine conllLine : tokenCluster) {
                                    TokenCluster newCluster = new TokenCluster();
                                    newCluster.add(conllLine);
                                    newTokens.add(newCluster);
                                }
                            } else {
                                // Compress: the first line of the cluster carries the compound's pos-tag
                                // and the attachment of the cluster's head.
                                String compPosTag = null;
                                for (ConllLine conllLine : tokenCluster) {
                                    if (conllLine.compPosTag != null) {
                                        compPosTag = conllLine.compPosTag;
                                        break;
                                    }
                                }
                                if (compPosTag == null) {
                                    throw new RuntimeException("Didn't find compPosTag on line: " + tokenCluster.get(0).lineNumber);
                                }
                                // The head is the first line of the cluster not labelled "dep_cpd".
                                ConllLine head = null;
                                for (ConllLine conllLine : tokenCluster) {
                                    if (!conllLine.label.equals("dep_cpd")) {
                                        head = conllLine;
                                        break;
                                    }
                                }
                                if (head == null) {
                                    throw new RuntimeException("Didn't find head on line: " + tokenCluster.get(0).lineNumber);
                                }
                                tokenCluster.get(0).posTag2 = compPosTag;
                                tokenCluster.get(0).posTag = compPosTag;
                                tokenCluster.get(0).governor = head.governor;
                                tokenCluster.get(0).label = head.label;
                                tokenCluster.get(0).projGov = head.projGov;
                                tokenCluster.get(0).projLabel = head.projLabel;
                                tokenCluster.get(0).removeMweHead();
                                // Derive the short pos-tag from the compound pos-tag; tags matching none of
                                // these cases keep posTag == compPosTag. "CL" must be tested before "C".
                                if (compPosTag.equals("NC") || compPosTag.equals("NPP")) {
                                    tokenCluster.get(0).posTag = "N";
                                } else if (compPosTag.startsWith("V")) {
                                    tokenCluster.get(0).posTag = "V";
                                } else if (compPosTag.startsWith("PRO")) {
                                    tokenCluster.get(0).posTag = "PRO";
                                } else if (compPosTag.startsWith("ADJ")) {
                                    tokenCluster.get(0).posTag = "A";
                                } else if (compPosTag.startsWith("DET")) {
                                    tokenCluster.get(0).posTag = "D";
                                } else if (compPosTag.startsWith("CL")) {
                                    tokenCluster.get(0).posTag = "CL";
                                } else if (compPosTag.startsWith("C")) {
                                    tokenCluster.get(0).posTag = "C";
                                }
                                newTokens.add(tokenCluster);
                            }
                        } else {
                            // Single-token cluster: kept as is.
                            newTokens.add(tokenCluster);
                        }
                    }
                    tokens = newTokens;
                    // Step 3: renumber the clusters from 1 and map every old line index to the new
                    // index of its cluster; index 0 maps to itself.
                    int currentIndex = 1;
                    Map<Integer, Integer> indexMap = new HashMap<Integer, Integer>();
                    indexMap.put(0, 0);
                    for (TokenCluster tokenCluster : tokens) {
                        tokenCluster.newIndex = currentIndex++;
                        for (ConllLine conllLine : tokenCluster) {
                            indexMap.put(conllLine.index, tokenCluster.newIndex);
                        }
                        tokenCluster.word = tokenCluster.get(0).word;
                        tokenCluster.lemma = tokenCluster.get(0).lemma;
                        // Build the cluster's word: parts are joined with "_" unless an apostrophe,
                        // hyphen or comma already sits at the junction (or the word so far is empty).
                        for (int i = 1; i < tokenCluster.size(); i++) {
                            ConllLine conllLine = tokenCluster.get(i);
                            if (tokenCluster.word.length() == 0 || tokenCluster.word.endsWith("'") || tokenCluster.word.endsWith("-") || tokenCluster.word.endsWith(",") || conllLine.word.startsWith("-") || conllLine.word.equals(",")) {
                                tokenCluster.word += conllLine.word;
                            } else {
                                tokenCluster.word += "_" + conllLine.word;
                            }
                        }
                        if (tokenCluster.size() > 1) {
                            // Multi-token lemma is the joined word, with its first character taken from
                            // the first token's lemma when only the word starts with an upper-case letter.
                            tokenCluster.lemma = tokenCluster.word;
                            if (Character.isUpperCase(tokenCluster.lemma.charAt(0))) {
                                if (!Character.isUpperCase(tokenCluster.get(0).lemma.charAt(0))) {
                                    tokenCluster.lemma = tokenCluster.get(0).lemma.charAt(0) + tokenCluster.lemma.substring(1);
                                }
                            }
                        }
                    }
                    // Step 4: rebuild one tab-separated line per cluster, using the renumbered indexes.
                    List<ConllLine> newLines = new ArrayList<SpmrlConverter.ConllLine>();
                    for (TokenCluster tokenCluster : tokens) {
                        ConllLine conllLine = tokenCluster.get(0);
                        if (conllLine.posTag2 == null || conllLine.posTag2.equals("null") || conllLine.posTag2.equals("UNK")) {
                            throw new RuntimeException("Bad postag on line: " + lineNumber + ": " + conllLine);
                        }
                        newLineNumber++;
                        String newLine = tokenCluster.newIndex + "\t" + tokenCluster.word + "\t" + tokenCluster.lemma + "\t" + conllLine.posTag + "\t" + conllLine.posTag2 + "\t" + conllLine.morph + "\t" + indexMap.get(conllLine.governor) + "\t" + conllLine.label + "\t" + indexMap.get(conllLine.projGov) + "\t" + conllLine.projLabel;
                        ConllLine newConllLine = new ConllLine(newLine, lineNumber, newLineNumber);
                        newLines.add(newConllLine);
                    }
                    // Step 5: retag "car" from CC to CS, and relabel it and its dependents accordingly
                    // (coord -> mod on "car" itself, dep_coord -> sub on lines governed by it).
                    for (ConllLine conllLine : newLines) {
                        if (conllLine.word.toLowerCase().equals("car") && conllLine.posTag2.equals("CC")) {
                            conllLine.posTag2 = "CS";
                            conllLine.morph = "s=s";
                            if (conllLine.label.equals("coord")) {
                                conllLine.label = "mod";
                            }
                            if (conllLine.projLabel.equals("coord")) {
                                conllLine.projLabel = "mod";
                            }
                            for (ConllLine otherLine : newLines) {
                                if (otherLine.governor == conllLine.index && otherLine.label.equals("dep_coord")) {
                                    otherLine.label = "sub";
                                }
                                if (otherLine.projGov == conllLine.index && otherLine.projLabel.equals("dep_coord")) {
                                    otherLine.projLabel = "sub";
                                }
                            }
                        }
                    }
                    // Step 6: compare every pair of lines (each pair once, via j > i) and report pairs
                    // whose [projGov, index] spans partially overlap.
                    int i = 0;
                    boolean hasNonProjective = false;
                    for (ConllLine conllLine : newLines) {
                        i++;
                        int headIndex = conllLine.projGov;
                        int depIndex = conllLine.index;
                        int startIndex = headIndex < depIndex ? headIndex : depIndex;
                        int endIndex = headIndex >= depIndex ? headIndex : depIndex;
                        int j = 0;
                        for (ConllLine otherLine : newLines) {
                            j++;
                            if (j <= i)
                                continue;
                            int headIndex2 = otherLine.projGov;
                            int depIndex2 = otherLine.index;
                            int startIndex2 = headIndex2 < depIndex2 ? headIndex2 : depIndex2;
                            int endIndex2 = headIndex2 >= depIndex2 ? headIndex2 : depIndex2;
                            boolean nonProjective = false;
                            if (startIndex2 < startIndex && endIndex2 > startIndex && endIndex2 < endIndex) {
                                // The other span starts before this one and ends strictly inside it.
                                nonProjective = true;
                            } else if (startIndex2 > startIndex && startIndex2 < endIndex && endIndex2 > endIndex) {
                                // The other span starts strictly inside this one and ends after it.
                                nonProjective = true;
                            }
                            if (nonProjective) {
                                LOG.error("Non-projective arcs at line: " + lineNumber);
                                LOG.error(conllLine.lineNumber + ": " + conllLine.toString());
                                LOG.error(otherLine.lineNumber + ": " + otherLine.toString());
                                hasNonProjective = true;
                                nonProjectiveCount++;
                            }
                        }
                    }
                    // Step 7: write the converted lines followed by a blank separator line.
                    for (ConllLine conllLine : newLines) {
                        writer.write(conllLine.toString() + "\n");
                    }
                    newLineNumber++;
                    writer.write("\n");
                    writer.flush();
                    if (errorOnNonProjective && hasNonProjective)
                        throw new RuntimeException("Found non projective arc");
                    lines = new ArrayList<SpmrlConverter.ConllLine>();
                } else {
                    // Non-blank line: parse it and buffer it until the next blank line.
                    ConllLine conllLine = new ConllLine(line, lineNumber, lineNumber);
                    lines.add(conllLine);
                }
            }
            scanner.close();
            writer.close();
            // Log how often each compound pos-tag pattern was seen, in the order defined by WeightedOutcome.
            Set<WeightedOutcome<String>> counts = new TreeSet<WeightedOutcome<String>>();
            for (String posTags : compoundPatternCounts.keySet()) {
                counts.add(new WeightedOutcome<String>(posTags, compoundPatternCounts.get(posTags)));
            }
            for (WeightedOutcome<String> count : counts) {
                LOG.info(count.getOutcome() + ": " + count.getWeight());
            }
            LOG.info("non projective count: " + nonProjectiveCount);
        } catch (Exception e) {
            // A failure on one file is logged; the loop then moves on to the next file.
            LogUtils.logError(LOG, e);
        } finally {
            // Safety net for the failure path: without it, an exception above would leave the
            // input and output files open while the remaining files are processed. Closing an
            // already closed Scanner or Writer has no effect, so the success path is unaffected.
            if (scanner != null)
                scanner.close();
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception closeError) {
                    LogUtils.logError(LOG, closeError);
                }
            }
        }
    }
}
Also used : Scanner(java.util.Scanner) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) BufferedWriter(java.io.BufferedWriter) FilenameFilter(java.io.FilenameFilter) TreeSet(java.util.TreeSet) InputStreamReader(java.io.InputStreamReader) WeightedOutcome(com.joliciel.talismane.utils.WeightedOutcome) TreeMap(java.util.TreeMap) FileInputStream(java.io.FileInputStream) TalismaneException(com.joliciel.talismane.TalismaneException) UnknownPosTagException(com.joliciel.talismane.posTagger.UnknownPosTagException) PosTagSet(com.joliciel.talismane.posTagger.PosTagSet) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer)

Example 4 with PosTagSet

use of com.joliciel.talismane.posTagger.PosTagSet in project talismane by joliciel-informatique.

The method testReadLexicons of the class LexiconReaderTest.

/**
 * Checks lexicon lookups for the word "dame" using the configuration in
 * {@code src/test/resources/testWithLex.conf}: the number of entries overall,
 * the number of entries for the pos-tag "NC", and the number of possible pos-tags.
 * <p>
 * The {@code config.file} system property and the config caches are restored in a
 * {@code finally} block, so a failing assertion does not leak this test's
 * configuration into tests that run afterwards.
 *
 * @throws Exception if the configuration or lexicon cannot be loaded
 */
@Test
public void testReadLexicons() throws Exception {
    System.setProperty("config.file", "src/test/resources/testWithLex.conf");
    try {
        ConfigFactory.invalidateCaches();
        // The returned Config is not needed here; the call is kept so that the
        // configuration is still loaded at the same point as before.
        ConfigFactory.load();
        final String sessionId = "test";
        PosTaggerLexicon lexicon = TalismaneSession.get(sessionId).getMergedLexicon();
        List<LexicalEntry> entries = lexicon.getEntries("dame");
        for (LexicalEntry entry : entries) {
            System.out.println(entry);
        }
        assertEquals(9, entries.size());
        PosTagSet posTagSet = TalismaneSession.get(sessionId).getPosTagSet();
        entries = lexicon.findLexicalEntries("dame", posTagSet.getPosTag("NC"));
        for (LexicalEntry entry : entries) {
            System.out.println(entry);
        }
        assertEquals(2, entries.size());
        Set<PosTag> posTags = lexicon.findPossiblePosTags("dame");
        System.out.println(posTags);
        assertEquals(4, posTags.size());
    } finally {
        // Always undo the global state changed above, even when an assertion fails.
        System.clearProperty("config.file");
        ConfigFactory.invalidateCaches();
    }
}
Also used : PosTagSet(com.joliciel.talismane.posTagger.PosTagSet) PosTag(com.joliciel.talismane.posTagger.PosTag) Config(com.typesafe.config.Config) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

PosTagSet (com.joliciel.talismane.posTagger.PosTagSet)4 PosTag (com.joliciel.talismane.posTagger.PosTag)3 WeightedOutcome (com.joliciel.talismane.utils.WeightedOutcome)2 FileOutputStream (java.io.FileOutputStream)2 ArrayList (java.util.ArrayList)2 TalismaneException (com.joliciel.talismane.TalismaneException)1 TalismaneTest (com.joliciel.talismane.TalismaneTest)1 UnknownPosTagException (com.joliciel.talismane.posTagger.UnknownPosTagException)1 Config (com.typesafe.config.Config)1 BufferedReader (java.io.BufferedReader)1 BufferedWriter (java.io.BufferedWriter)1 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 FilenameFilter (java.io.FilenameFilter)1 InputStreamReader (java.io.InputStreamReader)1 ObjectOutputStream (java.io.ObjectOutputStream)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Writer (java.io.Writer)1 HashMap (java.util.HashMap)1 Scanner (java.util.Scanner)1