Use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.
The class TestDependencyCorpus, method testMakeCRF.
/**
 * Export the CRF training corpus
 *
 * @throws Exception
 */
public void testMakeCRF() throws Exception {
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\dependency\\dev.txt")));
    LinkedList<CoNLLSentence> coNLLSentences = CoNLLLoader.loadSentenceList("D:\\Doc\\语料库\\依存分析训练数据\\THU\\dev.conll.fixed.txt");
    for (CoNLLSentence coNLLSentence : coNLLSentences) {
        for (CoNLLWord coNLLWord : coNLLSentence.word) {
            bw.write(coNLLWord.NAME);
            bw.write('\t');
            bw.write(coNLLWord.CPOSTAG);
            bw.write('\t');
            bw.write(coNLLWord.POSTAG);
            bw.write('\t');
            int d = coNLLWord.HEAD.ID - coNLLWord.ID;
            int posDistance = 1;
            if (d > 0) {
                // the head is after the current word
                for (int i = 1; i < d; ++i) {
                    if (coNLLSentence.word[coNLLWord.ID - 1 + i].CPOSTAG.equals(coNLLWord.HEAD.CPOSTAG)) {
                        ++posDistance;
                    }
                }
            } else {
                // the head is before the current word
                for (int i = 1; i < -d; ++i) {
                    if (coNLLSentence.word[coNLLWord.ID - 1 - i].CPOSTAG.equals(coNLLWord.HEAD.CPOSTAG)) {
                        ++posDistance;
                    }
                }
            }
            bw.write((d > 0 ? "+" : "-") + posDistance + "_" + coNLLWord.HEAD.CPOSTAG);
            bw.newLine();
        }
        bw.newLine();
    }
    bw.close();
}
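Each token becomes one tab-separated training row: surface form, coarse POS tag, fine POS tag, and an answer column such as "+2_v", read as "the head is the 2nd word with coarse tag v to the right" (the sign of d gives the direction, posDistance the rank among words sharing the head's coarse tag). A hypothetical sketch of two output rows and the standard CRF++ training call; the tokens, tags, and template file name are illustrative, not taken from the THU corpus:

w1	n	ns	+1_v
w2	v	v	-2_n

crf_learn template dev.txt crf_model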
Use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.
The class TestDependencyCorpus, method testPosTag.
/**
 * Convert fine-grained POS tags to coarse-grained ones
 *
 * @throws Exception
 */
public void testPosTag() throws Exception {
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    LinkedList<CoNLLSentence> coNLLSentences = CoNLLLoader.loadSentenceList("D:\\Doc\\语料库\\依存分析训练数据\\THU\\dev.conll.fixed.txt");
    for (CoNLLSentence coNLLSentence : coNLLSentences) {
        for (CoNLLWord coNLLWord : coNLLSentence.word) {
            dictionaryMaker.add(new Item(coNLLWord.POSTAG, coNLLWord.CPOSTAG));
        }
    }
    System.out.println(dictionaryMaker.entrySet());
}
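DictionaryMaker's internals aside, the pass above simply counts, for every fine-grained tag (POSTAG), how often each coarse-grained tag (CPOSTAG) co-occurs with it. A self-contained sketch of the equivalent computation with plain collections; the class name and sample tag pairs are made up for illustration:

import java.util.Map;
import java.util.TreeMap;

public class PosTagCounter {
    public static void main(String[] args) {
        // hypothetical (fine tag, coarse tag) pairs as they would come from the corpus
        String[][] observed = { {"vd", "v"}, {"vd", "v"}, {"ns", "n"} };
        Map<String, Map<String, Integer>> fineToCoarse = new TreeMap<>();
        for (String[] pair : observed) {
            fineToCoarse.computeIfAbsent(pair[0], k -> new TreeMap<>())
                        .merge(pair[1], 1, Integer::sum);
        }
        System.out.println(fineToCoarse); // {ns={n=1}, vd={v=2}}
    }
}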
Use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.
The class MinimumSpanningTreeParser, method parse.
@Override
public CoNLLSentence parse(List<Term> termList) {
    if (termList == null || termList.size() == 0)
        return null;
    termList.add(0, new Term("##核心##", Nature.begin));
    Node[] nodeArray = new Node[termList.size()];
    Iterator<Term> iterator = termList.iterator();
    for (int i = 0; i < nodeArray.length; ++i) {
        nodeArray[i] = new Node(iterator.next(), i);
    }
    Edge[][] edges = new Edge[nodeArray.length][nodeArray.length];
    for (int i = 0; i < edges.length; ++i) {
        for (int j = 0; j < edges[i].length; ++j) {
            if (i != j) {
                edges[j][i] = makeEdge(nodeArray, i, j);
            }
        }
    }
    // Prim's algorithm for the minimum spanning tree
    int max_v = nodeArray.length * (nodeArray.length - 1);
    float[] mincost = new float[max_v];
    Arrays.fill(mincost, Float.MAX_VALUE / 3);
    boolean[] used = new boolean[max_v];
    Arrays.fill(used, false);
    used[0] = true;
    PriorityQueue<State> que = new PriorityQueue<State>();
    // find the only child of the virtual root
    float minCostToRoot = Float.MAX_VALUE;
    Edge firstEdge = null;
    Edge[] edgeResult = new Edge[termList.size() - 1];
    for (Edge edge : edges[0]) {
        if (edge == null)
            continue;
        if (minCostToRoot > edge.cost) {
            firstEdge = edge;
            minCostToRoot = edge.cost;
        }
    }
    if (firstEdge == null)
        return null;
    que.add(new State(minCostToRoot, firstEdge.from, firstEdge));
    while (!que.isEmpty()) {
        State p = que.poll();
        int v = p.id;
        if (used[v] || p.cost > mincost[v])
            continue;
        used[v] = true;
        if (p.edge != null) {
            // System.out.println(p.edge.from + " " + p.edge.to + p.edge.label);
            edgeResult[p.edge.from - 1] = p.edge;
        }
        for (Edge e : edges[v]) {
            if (e == null)
                continue;
            if (mincost[e.from] > e.cost) {
                mincost[e.from] = e.cost;
                que.add(new State(mincost[e.from], e.from, e));
            }
        }
    }
    CoNLLWord[] wordArray = new CoNLLWord[termList.size() - 1];
    for (int i = 0; i < wordArray.length; ++i) {
        wordArray[i] = new CoNLLWord(i + 1, nodeArray[i + 1].word, nodeArray[i + 1].label);
        wordArray[i].DEPREL = edgeResult[i].label;
    }
    for (int i = 0; i < edgeResult.length; ++i) {
        int index = edgeResult[i].to - 1;
        if (index < 0) {
            wordArray[i].HEAD = CoNLLWord.ROOT;
            continue;
        }
        wordArray[i].HEAD = wordArray[index];
    }
    return new CoNLLSentence(wordArray);
}
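MinimumSpanningTreeParser is abstract (makeEdge supplies the arc costs), so it is used through a concrete subclass. A hedged usage sketch, assuming HanLP's standard segmenter and a max-entropy subclass along the lines of MaxEntDependencyParser with a default-loadable model; note that parse inserts a virtual root at index 0, so the term list must be mutable:

// Usage sketch, not verbatim HanLP API: assumes a concrete subclass
// such as MaxEntDependencyParser provides makeEdge() and a loaded model.
List<Term> termList = HanLP.segment("我爱北京天安门");
CoNLLSentence sentence = new MaxEntDependencyParser().parse(termList);
for (CoNLLWord word : sentence.word) {
    // print each dependency arc: dependent --(relation)--> head
    System.out.printf("%s --(%s)--> %s%n", word.NAME, word.DEPREL, word.HEAD.NAME);
}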
Use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.
The class MaxEntDependencyModelMaker, method makeModel.
public static boolean makeModel(String corpusLoadPath, String modelSavePath) throws IOException {
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(IOUtil.newOutputStream(modelSavePath)));
    LinkedList<CoNLLSentence> sentenceList = CoNLLLoader.loadSentenceList(corpusLoadPath);
    int id = 1;
    for (CoNLLSentence sentence : sentenceList) {
        System.out.printf("%d / %d...", id++, sentenceList.size());
        String[][] edgeArray = sentence.getEdgeArray();
        CoNLLWord[] word = sentence.getWordArrayWithRoot();
        for (int i = 0; i < word.length; ++i) {
            for (int j = 0; j < word.length; ++j) {
                if (i == j)
                    continue;
                // This is one edge instance from i to j; the edge may or may not exist,
                // and a non-existent edge (null) is still a valid (negative) instance
                List<String> contextList = new LinkedList<String>();
                // first generate the atomic features of i and j
                contextList.addAll(generateSingleWordContext(word, i, "i"));
                contextList.addAll(generateSingleWordContext(word, j, "j"));
                // then generate the pairwise features
                contextList.addAll(generateUniContext(word, i, j));
                // write the features out as strings
                for (String f : contextList) {
                    bw.write(f);
                    bw.write(' ');
                }
                // the event name is the dependency relation
                bw.write("" + edgeArray[i][j]);
                bw.newLine();
            }
        }
        System.out.println("done.");
    }
    bw.close();
    return true;
}
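Because edgeArray[i][j] is null for word pairs with no dependency, the "" + edgeArray[i][j] write emits the literal string "null", which serves as the negative class for the maximum-entropy trainer. A hedged invocation sketch; both paths are assumptions, not fixed by the source:

// Hypothetical paths; the corpus must be CoNLL-formatted so CoNLLLoader can read it.
boolean success = MaxEntDependencyModelMaker.makeModel(
        "data/dependency/train.conll",        // assumed corpus location
        "data/model/dependency/maxent.txt");  // assumed output location
System.out.println(success ? "training events written" : "failed");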
Use of com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence in project HanLP by hankcs.
The class WordNatureWeightModelMaker, method makeModel.
public static boolean makeModel(String corpusLoadPath, String modelSavePath) {
    Set<String> posSet = new TreeSet<String>();
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    for (CoNLLSentence sentence : CoNLLLoader.loadSentenceList(corpusLoadPath)) {
        for (CoNLLWord word : sentence.word) {
            addPair(word.NAME, word.HEAD.NAME, word.DEPREL, dictionaryMaker);
            addPair(word.NAME, wrapTag(word.HEAD.POSTAG), word.DEPREL, dictionaryMaker);
            addPair(wrapTag(word.POSTAG), word.HEAD.NAME, word.DEPREL, dictionaryMaker);
            addPair(wrapTag(word.POSTAG), wrapTag(word.HEAD.POSTAG), word.DEPREL, dictionaryMaker);
            posSet.add(word.POSTAG);
        }
    }
    StringBuilder sb = new StringBuilder();
    for (String pos : posSet) {
        sb.append("case \"" + pos + "\":\n");
    }
    IOUtil.saveTxt("data/model/dependency/pos-thu.txt", sb.toString());
    return dictionaryMaker.saveTxtTo(modelSavePath);
}
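A hedged invocation sketch; the paths are assumptions. Note the side effect: besides saving the word/POS pair-frequency dictionary to modelSavePath, the method writes a switch-case skeleton of every observed POS tag to data/model/dependency/pos-thu.txt:

// Hypothetical paths; only pos-thu.txt's location is fixed in the method itself.
boolean success = WordNatureWeightModelMaker.makeModel(
        "data/dependency/train.conll",            // assumed corpus location
        "data/model/dependency/WordNature.txt");  // assumed model destination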