Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
From class CRFSegment, method toTermList:
/**
 * Convert a path (a list of vertices) into the final result
 *
 * @param vertexList    the path of vertices produced by segmentation
 * @param offsetEnabled whether to compute each term's offset
 * @return the resulting term list
 */
protected static List<Term> toTermList(List<Vertex> vertexList, boolean offsetEnabled) {
    assert vertexList != null;
    int length = vertexList.size();
    List<Term> resultList = new ArrayList<Term>(length);
    Iterator<Vertex> iterator = vertexList.iterator();
    if (offsetEnabled) {
        int offset = 0;
        for (int i = 0; i < length; ++i) {
            Vertex vertex = iterator.next();
            Term term = convert(vertex);
            term.offset = offset;
            offset += term.length();
            resultList.add(term);
        }
    } else {
        for (int i = 0; i < length; ++i) {
            Vertex vertex = iterator.next();
            Term term = convert(vertex);
            resultList.add(term);
        }
    }
    return resultList;
}
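The offset branch above is what fills Term.offset when offset calculation is requested. A minimal usage sketch, assuming HanLP's usual entry points HanLP.newSegment() and Segment.enableOffset(boolean) plus a configured HanLP data directory (the class name OffsetDemo is illustrative):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import java.util.List;

public class OffsetDemo {
    public static void main(String[] args) {
        // ask the segment to fill Term.offset, which is what toTermList does when offsetEnabled is true
        Segment segment = HanLP.newSegment().enableOffset(true);
        List<Term> termList = segment.seg("商品和服务");
        for (Term term : termList) {
            // each term reports where it starts in the original text and how many characters it spans
            System.out.println(term.word + "\toffset=" + term.offset + "\tlength=" + term.length());
        }
    }
}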
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
From class HMMSegment, method segSentence:
@Override
protected List<Term> segSentence(char[] sentence) {
    char[] tag = model.tag(sentence);
    List<Term> termList = new LinkedList<Term>();
    int offset = 0;
    for (int i = 0; i < tag.length; offset += 1, ++i) {
        switch (tag[i]) {
            case 'b':
            {
                // a 'b' tag opens a word; scan forward to the matching 'e' tag
                int begin = offset;
                while (tag[i] != 'e') {
                    offset += 1;
                    ++i;
                    if (i == tag.length) {
                        break;
                    }
                }
                if (i == tag.length) {
                    // the tag sequence ended without an 'e': emit the span collected so far
                    termList.add(new Term(new String(sentence, begin, offset - begin), null));
                } else
                    termList.add(new Term(new String(sentence, begin, offset - begin + 1), null));
            }
            break;
            default:
            {
                // any other tag yields a single-character term
                termList.add(new Term(new String(sentence, offset, 1), null));
            }
            break;
        }
    }
    return termList;
}
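The loop above walks a BMES-style tag sequence produced by the HMM: a 'b' tag opens a multi-character word that is closed by the next 'e', and any other tag falls through to a single-character term. The following self-contained sketch (a hypothetical tagToWords helper, not part of HanLP) shows the same span logic on a hand-written tag sequence:

import java.util.ArrayList;
import java.util.List;

public class BmesSpans {

    // Collect words from characters and their b/m/e/s tags, mirroring the span logic of segSentence above.
    static List<String> tagToWords(char[] sentence, char[] tag) {
        List<String> words = new ArrayList<String>();
        int offset = 0;
        for (int i = 0; i < tag.length; ++offset, ++i) {
            if (tag[i] == 'b') {
                int begin = offset;
                while (tag[i] != 'e') {            // scan forward to the closing 'e'
                    ++offset;
                    ++i;
                    if (i == tag.length) break;    // ran off the end: emit what we have
                }
                int len = (i == tag.length) ? offset - begin : offset - begin + 1;
                words.add(new String(sentence, begin, len));
            } else {
                words.add(new String(sentence, offset, 1));  // any other tag becomes a single-character word
            }
        }
        return words;
    }

    public static void main(String[] args) {
        char[] text = "我爱自然语言处理".toCharArray();
        char[] tags = { 's', 's', 'b', 'm', 'm', 'e', 'b', 'e' };
        System.out.println(tagToWords(text, tags));  // [我, 爱, 自然语言, 处理]
    }
}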
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
From class MutualInformationEntropyPhraseExtractor, method extractPhrase:
@Override
public List<String> extractPhrase(String text, int size) {
    List<String> phraseList = new LinkedList<String>();
    Occurrence occurrence = new Occurrence();
    // besides core stop words, exclude time words (t) and letter strings (nx)
    Filter[] filterChain = new Filter[] { CoreStopWordDictionary.FILTER, new Filter() {
        @Override
        public boolean shouldInclude(Term term) {
            switch (term.nature) {
                case t:
                case nx:
                    return false;
            }
            return true;
        }
    } };
    for (List<Term> sentence : NotionalTokenizer.seg2sentence(text, filterChain)) {
        if (HanLP.Config.DEBUG) {
            System.out.println(sentence);
        }
        occurrence.addAll(sentence);
    }
    occurrence.compute();
    if (HanLP.Config.DEBUG) {
        System.out.println(occurrence);
        for (PairFrequency phrase : occurrence.getPhraseByMi()) {
            System.out.print(phrase.getKey().replace(Occurrence.RIGHT, '→') + "\tmi=" + phrase.mi + " , ");
        }
        System.out.println();
        for (PairFrequency phrase : occurrence.getPhraseByLe()) {
            System.out.print(phrase.getKey().replace(Occurrence.RIGHT, '→') + "\tle=" + phrase.le + " , ");
        }
        System.out.println();
        for (PairFrequency phrase : occurrence.getPhraseByRe()) {
            System.out.print(phrase.getKey().replace(Occurrence.RIGHT, '→') + "\tre=" + phrase.re + " , ");
        }
        System.out.println();
        for (PairFrequency phrase : occurrence.getPhraseByScore()) {
            System.out.print(phrase.getKey().replace(Occurrence.RIGHT, '→') + "\tscore=" + phrase.score + " , ");
        }
        System.out.println();
    }
    for (PairFrequency phrase : occurrence.getPhraseByScore()) {
        if (phraseList.size() == size)
            break;
        phraseList.add(phrase.first + phrase.second);
    }
    return phraseList;
}
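To exercise an extractor of this kind end to end, HanLP exposes a convenience method HanLP.extractPhrase(text, size); the short sketch below assumes that entry point and a configured data directory (the class name PhraseDemo is illustrative):

import com.hankcs.hanlp.HanLP;
import java.util.List;

public class PhraseDemo {
    public static void main(String[] args) {
        String text = "算法工程师需要掌握机器学习和自然语言处理的基础知识";
        // ask for the top 3 phrases ranked by the combined mutual-information/entropy score
        List<String> phrases = HanLP.extractPhrase(text, 3);
        System.out.println(phrases);
    }
}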
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
From class MinimumSpanningTreeParser, method parse:
@Override
public CoNLLSentence parse(List<Term> termList) {
    if (termList == null || termList.size() == 0)
        return null;
    // prepend the virtual root term
    termList.add(0, new Term("##核心##", Nature.begin));
    Node[] nodeArray = new Node[termList.size()];
    Iterator<Term> iterator = termList.iterator();
    for (int i = 0; i < nodeArray.length; ++i) {
        nodeArray[i] = new Node(iterator.next(), i);
    }
    Edge[][] edges = new Edge[nodeArray.length][nodeArray.length];
    for (int i = 0; i < edges.length; ++i) {
        for (int j = 0; j < edges[i].length; ++j) {
            if (i != j) {
                edges[j][i] = makeEdge(nodeArray, i, j);
            }
        }
    }
    // Prim's algorithm for the minimum spanning tree
    int max_v = nodeArray.length * (nodeArray.length - 1);
    float[] mincost = new float[max_v];
    Arrays.fill(mincost, Float.MAX_VALUE / 3);
    boolean[] used = new boolean[max_v];
    Arrays.fill(used, false);
    used[0] = true;
    PriorityQueue<State> que = new PriorityQueue<State>();
    // find the single child of the virtual root
    float minCostToRoot = Float.MAX_VALUE;
    Edge firstEdge = null;
    Edge[] edgeResult = new Edge[termList.size() - 1];
    for (Edge edge : edges[0]) {
        if (edge == null)
            continue;
        if (minCostToRoot > edge.cost) {
            firstEdge = edge;
            minCostToRoot = edge.cost;
        }
    }
    if (firstEdge == null)
        return null;
    que.add(new State(minCostToRoot, firstEdge.from, firstEdge));
    while (!que.isEmpty()) {
        State p = que.poll();
        int v = p.id;
        if (used[v] || p.cost > mincost[v])
            continue;
        used[v] = true;
        if (p.edge != null) {
            // System.out.println(p.edge.from + " " + p.edge.to + p.edge.label);
            edgeResult[p.edge.from - 1] = p.edge;
        }
        for (Edge e : edges[v]) {
            if (e == null)
                continue;
            if (mincost[e.from] > e.cost) {
                mincost[e.from] = e.cost;
                que.add(new State(mincost[e.from], e.from, e));
            }
        }
    }
    CoNLLWord[] wordArray = new CoNLLWord[termList.size() - 1];
    for (int i = 0; i < wordArray.length; ++i) {
        wordArray[i] = new CoNLLWord(i + 1, nodeArray[i + 1].word, nodeArray[i + 1].label);
        wordArray[i].DEPREL = edgeResult[i].label;
    }
    for (int i = 0; i < edgeResult.length; ++i) {
        int index = edgeResult[i].to - 1;
        if (index < 0) {
            wordArray[i].HEAD = CoNLLWord.ROOT;
            continue;
        }
        wordArray[i].HEAD = wordArray[index];
    }
    return new CoNLLSentence(wordArray);
}
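The parse above returns a CoNLLSentence whose CoNLLWord entries carry HEAD and DEPREL links. A minimal usage sketch, assuming HanLP's convenience entry point HanLP.parseDependency(String), the CoNll import package of HanLP 1.x, and that CoNLLSentence iterates over its words (the class name DependencyDemo is illustrative):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence;
import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord;

public class DependencyDemo {
    public static void main(String[] args) {
        CoNLLSentence sentence = HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。");
        for (CoNLLWord word : sentence) {
            // print each word, its dependency relation, and the head it attaches to
            System.out.printf("%s --(%s)--> %s%n", word.LEMMA, word.DEPREL, word.HEAD.LEMMA);
        }
    }
}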
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
From class TextRankKeyword, method getRank:
/**
 * Compute TextRank scores using words that have already been segmented
 * @param termList the segmented terms of the document
 * @return a map from word to its TextRank score
 */
public Map<String, Float> getRank(List<Term> termList) {
    List<String> wordList = new ArrayList<String>(termList.size());
    for (Term t : termList) {
        if (shouldInclude(t)) {
            wordList.add(t.word);
        }
    }
    // System.out.println(wordList);
    // build the co-occurrence graph over a sliding window of 5 words
    Map<String, Set<String>> words = new TreeMap<String, Set<String>>();
    Queue<String> que = new LinkedList<String>();
    for (String w : wordList) {
        if (!words.containsKey(w)) {
            words.put(w, new TreeSet<String>());
        }
        que.offer(w);
        if (que.size() > 5) {
            que.poll();
        }
        for (String w1 : que) {
            for (String w2 : que) {
                if (w1.equals(w2)) {
                    continue;
                }
                words.get(w1).add(w2);
                words.get(w2).add(w1);
            }
        }
    }
    // System.out.println(words);
    // iterate the PageRank-style update until convergence or max_iter rounds
    Map<String, Float> score = new HashMap<String, Float>();
    for (int i = 0; i < max_iter; ++i) {
        Map<String, Float> m = new HashMap<String, Float>();
        float max_diff = 0;
        for (Map.Entry<String, Set<String>> entry : words.entrySet()) {
            String key = entry.getKey();
            Set<String> value = entry.getValue();
            m.put(key, 1 - d);
            for (String element : value) {
                int size = words.get(element).size();
                if (key.equals(element) || size == 0)
                    continue;
                m.put(key, m.get(key) + d / size * (score.get(element) == null ? 0 : score.get(element)));
            }
            max_diff = Math.max(max_diff, Math.abs(m.get(key) - (score.get(key) == null ? 0 : score.get(key))));
        }
        score = m;
        if (max_diff <= min_diff)
            break;
    }
    return score;
}
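Each iteration applies the TextRank update score(w) = (1 - d) + d * Σ score(u) / deg(u) over the words u that co-occur with w inside the 5-word window, stopping once the largest per-word change falls below min_diff. To run it end to end, HanLP exposes HanLP.extractKeyword(document, size); the sketch below assumes that entry point and a configured data directory (the class name KeywordDemo is illustrative):

import com.hankcs.hanlp.HanLP;
import java.util.List;

public class KeywordDemo {
    public static void main(String[] args) {
        String document = "程序员是从事程序开发、维护的专业人员，一般分为程序设计人员和程序编码人员。";
        // extractKeyword segments the document, drops stop words, and ranks the remaining words with TextRank
        List<String> keywords = HanLP.extractKeyword(document, 5);
        System.out.println(keywords);
    }
}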