Search in sources:

Example 6 with CoNLLSentence

use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.

From class TestDependencyCorpus, method testMakeCRF.

/**
 * Exports CRF training corpus (original: 导出CRF训练语料).
 * <p>
 * One row per word: NAME, CPOSTAG, POSTAG, then a head feature of the form
 * {sign}{posDistance}_{head CPOSTAG}, where posDistance is 1 plus the number of
 * intervening words that share the head's coarse POS tag. Sentences are
 * separated by blank lines, as CRF++ expects.
 *
 * @throws Exception on any corpus-loading or I/O failure
 */
public void testMakeCRF() throws Exception {
    // NOTE(review): hard-coded Windows paths and the platform-default charset
    // are kept as-is; this is a developer-local corpus-export utility.
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\dependency\\dev.txt")));
    try {
        LinkedList<CoNLLSentence> coNLLSentences = CoNLLLoader.loadSentenceList("D:\\Doc\\语料库\\依存分析训练数据\\THU\\dev.conll.fixed.txt");
        for (CoNLLSentence coNLLSentence : coNLLSentences) {
            for (CoNLLWord coNLLWord : coNLLSentence.word) {
                bw.write(coNLLWord.NAME);
                bw.write('\t');
                bw.write(coNLLWord.CPOSTAG);
                bw.write('\t');
                bw.write(coNLLWord.POSTAG);
                bw.write('\t');
                // Signed offset from this word to its head (positive: head is after it).
                int d = coNLLWord.HEAD.ID - coNLLWord.ID;
                int posDistance = 1;
                if (d > 0) {
                    // Head lies after this word (original comment: 在后面):
                    // count intervening words carrying the head's coarse POS tag.
                    for (int i = 1; i < d; ++i) {
                        if (coNLLSentence.word[coNLLWord.ID - 1 + i].CPOSTAG.equals(coNLLWord.HEAD.CPOSTAG)) {
                            ++posDistance;
                        }
                    }
                } else {
                    // Head lies before this word (original comment: 在前面).
                    for (int i = 1; i < -d; ++i) {
                        if (coNLLSentence.word[coNLLWord.ID - 1 - i].CPOSTAG.equals(coNLLWord.HEAD.CPOSTAG)) {
                            ++posDistance;
                        }
                    }
                }
                bw.write((d > 0 ? "+" : "-") + posDistance + "_" + coNLLWord.HEAD.CPOSTAG);
                bw.newLine();
            }
            // Blank line terminates each sentence.
            bw.newLine();
        }
    } finally {
        // Fix: the original leaked the writer whenever a load or write threw.
        bw.close();
    }
}
Also used : CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) FileOutputStream(java.io.FileOutputStream) CoNLLSentence(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter)

Example 7 with CoNLLSentence

use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.

From class TestDependencyCorpus, method testPosTag.

/**
 * Builds a fine-grained → coarse-grained POS tag mapping (original: 细粒度转粗粒度)
 * from the CoNLL corpus and prints the collected entries.
 *
 * @throws Exception on any corpus-loading failure
 */
public void testPosTag() throws Exception {
    DictionaryMaker maker = new DictionaryMaker();
    // Record every (POSTAG, CPOSTAG) pair observed in the corpus.
    for (CoNLLSentence sentence : CoNLLLoader.loadSentenceList("D:\\Doc\\语料库\\依存分析训练数据\\THU\\dev.conll.fixed.txt")) {
        for (CoNLLWord word : sentence.word) {
            maker.add(new Item(word.POSTAG, word.CPOSTAG));
        }
    }
    System.out.println(maker.entrySet());
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) CoNLLSentence(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Example 8 with CoNLLSentence

use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.

From class MinimumSpanningTreeParser, method parse.

@Override
public CoNLLSentence parse(List<Term> termList) {
    // Nothing to parse.
    if (termList == null || termList.size() == 0)
        return null;
    // Prepend a virtual root ("##核心##") at index 0 so every real word has a
    // potential head. NOTE(review): this mutates the caller's termList in
    // place — confirm callers tolerate that.
    termList.add(0, new Term("##核心##", Nature.begin));
    // Wrap each term in a graph node; node 0 is the virtual root.
    Node[] nodeArray = new Node[termList.size()];
    Iterator<Term> iterator = termList.iterator();
    for (int i = 0; i < nodeArray.length; ++i) {
        nodeArray[i] = new Node(iterator.next(), i);
    }
    // Build the full edge matrix; self-loops (i == j) stay null.
    // Note the transposed store: edges[j][i] receives makeEdge(nodeArray, i, j),
    // so the row edges[v] groups the edges relaxed after vertex v is added below.
    Edge[][] edges = new Edge[nodeArray.length][nodeArray.length];
    for (int i = 0; i < edges.length; ++i) {
        for (int j = 0; j < edges[i].length; ++j) {
            if (i != j) {
                edges[j][i] = makeEdge(nodeArray, i, j);
            }
        }
    }
    // Minimum spanning tree, Prim-style search (original: 最小生成树Prim算法).
    int max_v = nodeArray.length * (nodeArray.length - 1);
    // "Infinity" chosen as MAX_VALUE / 3 so later comparisons cannot overflow.
    float[] mincost = new float[max_v];
    Arrays.fill(mincost, Float.MAX_VALUE / 3);
    boolean[] used = new boolean[max_v];
    Arrays.fill(used, false);
    // The virtual root is in the tree from the start.
    used[0] = true;
    PriorityQueue<State> que = new PriorityQueue<State>();
    // Pick the single cheapest child of the virtual root (original: 找虚根的唯一孩子).
    float minCostToRoot = Float.MAX_VALUE;
    Edge firstEdge = null;
    Edge[] edgeResult = new Edge[termList.size() - 1];
    for (Edge edge : edges[0]) {
        if (edge == null)
            continue;
        if (minCostToRoot > edge.cost) {
            firstEdge = edge;
            minCostToRoot = edge.cost;
        }
    }
    // No candidate can attach to the root: parsing is impossible.
    if (firstEdge == null)
        return null;
    que.add(new State(minCostToRoot, firstEdge.from, firstEdge));
    // Grow the tree: pop the cheapest frontier state, commit its edge, then
    // relax the edges stored under the newly added vertex.
    while (!que.isEmpty()) {
        State p = que.poll();
        int v = p.id;
        // Skip stale queue entries (vertex already taken or a cheaper path known).
        if (used[v] || p.cost > mincost[v])
            continue;
        used[v] = true;
        if (p.edge != null) {
            // edgeResult is indexed by the dependent word: from - 1 skips the root.
            edgeResult[p.edge.from - 1] = p.edge;
        }
        for (Edge e : edges[v]) {
            if (e == null)
                continue;
            if (mincost[e.from] > e.cost) {
                mincost[e.from] = e.cost;
                que.add(new State(mincost[e.from], e.from, e));
            }
        }
    }
    // Convert chosen edges into CoNLL words: nodeArray[i + 1] maps to wordArray[i];
    // the dependency label comes from that word's selected edge.
    CoNLLWord[] wordArray = new CoNLLWord[termList.size() - 1];
    for (int i = 0; i < wordArray.length; ++i) {
        wordArray[i] = new CoNLLWord(i + 1, nodeArray[i + 1].word, nodeArray[i + 1].label);
        wordArray[i].DEPREL = edgeResult[i].label;
    }
    // Wire up HEAD pointers; an edge whose 'to' is 0 (index -1) attaches to ROOT.
    for (int i = 0; i < edgeResult.length; ++i) {
        int index = edgeResult[i].to - 1;
        if (index < 0) {
            wordArray[i].HEAD = CoNLLWord.ROOT;
            continue;
        }
        wordArray[i].HEAD = wordArray[index];
    }
    return new CoNLLSentence(wordArray);
}
Also used : Node(com.hankcs.hanlp.dependency.common.Node) Term(com.hankcs.hanlp.seg.common.Term) PriorityQueue(java.util.PriorityQueue) State(com.hankcs.hanlp.dependency.common.State) CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) CoNLLSentence(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence) Edge(com.hankcs.hanlp.dependency.common.Edge)

Example 9 with CoNLLSentence

use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.

From class MaxEntDependencyModelMaker, method makeModel.

/**
 * Writes maximum-entropy training events for the dependency parser.
 * <p>
 * For every ordered word pair (i, j) with i != j, one event line is emitted:
 * the atomic features of i and j, their pairwise features, then the outcome —
 * the dependency label from i to j, or the literal string "null" when no edge
 * exists (missing edges deliberately serve as negative examples).
 *
 * @param corpusLoadPath path of the CoNLL corpus to read
 * @param modelSavePath  destination path of the event file
 * @return true on success
 * @throws IOException on any write failure
 */
public static boolean makeModel(String corpusLoadPath, String modelSavePath) throws IOException {
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(IOUtil.newOutputStream(modelSavePath)));
    try {
        LinkedList<CoNLLSentence> sentenceList = CoNLLLoader.loadSentenceList(corpusLoadPath);
        int id = 1;
        for (CoNLLSentence sentence : sentenceList) {
            System.out.printf("%d / %d...", id++, sentenceList.size());
            String[][] edgeArray = sentence.getEdgeArray();
            CoNLLWord[] word = sentence.getWordArrayWithRoot();
            for (int i = 0; i < word.length; ++i) {
                for (int j = 0; j < word.length; ++j) {
                    if (i == j)
                        continue;
                    // One edge instance from i to j; it may or may not exist, and a
                    // non-existent edge (null) is still an instance.
                    // (original: 这就是一个边的实例,从i出发,到j,当然它可能存在也可能不存在,不存在取null照样是一个实例)
                    List<String> contextList = new LinkedList<String>();
                    // Atomic features of i and of j.
                    contextList.addAll(generateSingleWordContext(word, i, "i"));
                    contextList.addAll(generateSingleWordContext(word, j, "j"));
                    // Pairwise (bigram) features.
                    contextList.addAll(generateUniContext(word, i, j));
                    // Serialize features, space-separated.
                    for (String f : contextList) {
                        bw.write(f);
                        bw.write(' ');
                    }
                    // Event outcome = dependency relation; "" + converts null to "null".
                    bw.write("" + edgeArray[i][j]);
                    bw.newLine();
                }
            }
            System.out.println("done.");
        }
    } finally {
        // Fix: the original never closed the writer when an exception escaped,
        // leaking the stream and possibly losing buffered output.
        bw.close();
    }
    return true;
}
Also used : CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) CoNLLSentence(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence)

Example 10 with CoNLLSentence

use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.

From class WordNatureWeightModelMaker, method makeModel.

/**
 * Builds the word/POS pair-weight model from a CoNLL corpus and saves it.
 * <p>
 * For every word it records all four combinations of (word form | wrapped POS)
 * for the dependent and its head, each labeled with the dependency relation,
 * and also collects the set of POS tags seen. A switch-case skeleton for the
 * POS set is written to data/model/dependency/pos-thu.txt as a code-generation aid.
 *
 * @param corpusLoadPath path of the CoNLL corpus to read
 * @param modelSavePath  destination path of the weight model
 * @return result of saving the dictionary
 */
public static boolean makeModel(String corpusLoadPath, String modelSavePath) {
    Set<String> posSet = new TreeSet<String>();
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    // Fix: the original ran this identical loop twice (copy-paste duplication),
    // loading the corpus and counting every pair twice. Since every count was
    // scaled by the same factor of 2, relative weights are unchanged by the fix.
    for (CoNLLSentence sentence : CoNLLLoader.loadSentenceList(corpusLoadPath)) {
        for (CoNLLWord word : sentence.word) {
            // All four (dependent, head) combinations of surface form vs. wrapped POS.
            addPair(word.NAME, word.HEAD.NAME, word.DEPREL, dictionaryMaker);
            addPair(word.NAME, wrapTag(word.HEAD.POSTAG), word.DEPREL, dictionaryMaker);
            addPair(wrapTag(word.POSTAG), word.HEAD.NAME, word.DEPREL, dictionaryMaker);
            addPair(wrapTag(word.POSTAG), wrapTag(word.HEAD.POSTAG), word.DEPREL, dictionaryMaker);
            posSet.add(word.POSTAG);
        }
    }
    // Emit a switch-case skeleton for every POS tag seen (sorted via TreeSet).
    StringBuilder sb = new StringBuilder();
    for (String pos : posSet) {
        sb.append("case \"").append(pos).append("\":\n");
    }
    IOUtil.saveTxt("data/model/dependency/pos-thu.txt", sb.toString());
    return dictionaryMaker.saveTxtTo(modelSavePath);
}
Also used : TreeSet(java.util.TreeSet) CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) CoNLLSentence(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Aggregations

CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence)10 CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)10 Term (com.hankcs.hanlp.seg.common.Term)4 DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)2 Evaluator (com.hankcs.hanlp.corpus.dependency.CoNll.Evaluator)1 Item (com.hankcs.hanlp.corpus.dictionary.item.Item)1 Edge (com.hankcs.hanlp.dependency.common.Edge)1 Node (com.hankcs.hanlp.dependency.common.Node)1 State (com.hankcs.hanlp.dependency.common.State)1 Table (com.hankcs.hanlp.model.crf.Table)1 BufferedWriter (java.io.BufferedWriter)1 FileOutputStream (java.io.FileOutputStream)1 OutputStreamWriter (java.io.OutputStreamWriter)1 ArrayList (java.util.ArrayList)1 LinkedList (java.util.LinkedList)1 PriorityQueue (java.util.PriorityQueue)1 TreeSet (java.util.TreeSet)1