Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs:
class MutualInformationEntropyPhraseExtractor, method extractPhrase.
@Override
public List<String> extractPhrase(String text, int size) {
    // Accumulate pairwise co-occurrence statistics over the notional terms of
    // every sentence, then return up to `size` top-scoring adjacent word pairs.
    Occurrence occurrence = new Occurrence();
    // Filter chain: core stop words first, then exclude terms whose nature is
    // t or nx (see the switch below).
    Filter[] filterChain = new Filter[]{
            CoreStopWordDictionary.FILTER,
            new Filter() {
                @Override
                public boolean shouldInclude(Term term) {
                    switch (term.nature) {
                        case t:
                        case nx:
                            return false;
                        default:
                            return true;
                    }
                }
            }
    };
    for (List<Term> sentence : NotionalTokenizer.seg2sentence(text, filterChain)) {
        if (HanLP.Config.DEBUG) {
            System.out.println(sentence);
        }
        occurrence.addAll(sentence);
    }
    occurrence.compute();
    if (HanLP.Config.DEBUG) {
        // Dump each ranking (mutual information, left/right entropy, combined
        // score) on its own line for inspection.
        System.out.println(occurrence);
        for (PairFrequency pf : occurrence.getPhraseByMi()) {
            System.out.print(pf.getKey().replace(Occurrence.RIGHT, '→') + "\tmi=" + pf.mi + " , ");
        }
        System.out.println();
        for (PairFrequency pf : occurrence.getPhraseByLe()) {
            System.out.print(pf.getKey().replace(Occurrence.RIGHT, '→') + "\tle=" + pf.le + " , ");
        }
        System.out.println();
        for (PairFrequency pf : occurrence.getPhraseByRe()) {
            System.out.print(pf.getKey().replace(Occurrence.RIGHT, '→') + "\tre=" + pf.re + " , ");
        }
        System.out.println();
        for (PairFrequency pf : occurrence.getPhraseByScore()) {
            System.out.print(pf.getKey().replace(Occurrence.RIGHT, '→') + "\tscore=" + pf.score + " , ");
        }
        System.out.println();
    }
    // Take the first `size` pairs in score order.
    List<String> result = new LinkedList<String>();
    for (PairFrequency pf : occurrence.getPhraseByScore()) {
        if (result.size() == size) {
            break;
        }
        result.add(pf.first + pf.second);
    }
    return result;
}
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs:
class AhoCorasickDoubleArrayTrieSegment, method segSentence.
@Override
protected List<Term> segSentence(char[] sentence) {
// Longest-match segmentation driven by an Aho-Corasick double-array trie.
// Characters not covered by any dictionary hit fall back to atom segmentation.
if (trie == null) {
logger.warning("还未加载任何词典");
return Collections.emptyList();
}
// wordNet[i] = length of the word chosen to start at char i; defaults to 1
// (every char is at least a single-character word).
final int[] wordNet = new int[sentence.length];
Arrays.fill(wordNet, 1);
// natureArray[i] = part of speech for the word starting at i; only allocated
// when speech tagging is enabled.
final Nature[] natureArray = config.speechTagging ? new Nature[sentence.length] : null;
trie.parseText(sentence, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
@Override
public void hit(int begin, int end, CoreDictionary.Attribute value) {
int length = end - begin;
// Keep only the longest dictionary match at each start position.
if (length > wordNet[begin]) {
wordNet[begin] = length;
if (config.speechTagging) {
// Takes the first listed nature of the entry — presumably the
// primary one; TODO confirm the ordering convention of Attribute.nature.
natureArray[begin] = value.nature[0];
}
}
}
});
LinkedList<Term> termList = new LinkedList<Term>();
if (config.speechTagging) {
// Fill gaps (runs of positions with no tagged dictionary word) via atom
// segmentation so every position ends up with a nature.
for (int i = 0; i < natureArray.length; ) {
if (natureArray[i] == null) {
// [i, j) is the maximal untagged run.
int j = i + 1;
for (; j < natureArray.length; ++j) {
if (natureArray[j] != null)
break;
}
List<AtomNode> atomNodeList = quickAtomSegment(sentence, i, j);
for (AtomNode atomNode : atomNodeList) {
// Only overwrite when the atom is at least as long as the
// current entry; i advances inside this loop so successive
// atoms land at successive positions. NOTE(review): order of
// this advance relative to the loop is subtle — do not reorder.
if (atomNode.sWord.length() >= wordNet[i]) {
wordNet[i] = atomNode.sWord.length();
natureArray[i] = atomNode.getNature();
i += wordNet[i];
}
}
// Skip to the end of the untagged run regardless.
i = j;
} else {
++i;
}
}
}
// Materialize terms by walking the word net; unknown natures become nz.
for (int i = 0; i < wordNet.length; ) {
Term term = new Term(new String(sentence, i, wordNet[i]), config.speechTagging ? (natureArray[i] == null ? Nature.nz : natureArray[i]) : null);
term.offset = i;
termList.add(term);
i += wordNet[i];
}
return termList;
}
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs:
class Segment, method seg.
/**
* Word segmentation<br>
* This method is thread-safe.
*
* @param text the text to segment
* @return the list of terms
*/
public List<Term> seg(String text) {
char[] charArray = text.toCharArray();
if (HanLP.Config.Normalization) {
CharTable.normalization(charArray);
}
// Multithreading small texts is pointless — it actually makes them slower.
if (config.threadNumber > 1 && charArray.length > 10000) {
// Split into sentences and distribute them across worker threads.
List<String> sentenceList = SentencesUtil.toSentenceList(charArray);
String[] sentenceArray = new String[sentenceList.size()];
sentenceList.toArray(sentenceArray);
//noinspection unchecked
List<Term>[] termListArray = new List[sentenceArray.length];
// Each of the first threadNumber-1 workers gets `per` sentences; the last
// worker takes the remainder.
final int per = sentenceArray.length / config.threadNumber;
WorkThread[] threadArray = new WorkThread[config.threadNumber];
for (int i = 0; i < config.threadNumber - 1; ++i) {
int from = i * per;
threadArray[i] = new WorkThread(sentenceArray, termListArray, from, from + per);
threadArray[i].start();
}
threadArray[config.threadNumber - 1] = new WorkThread(sentenceArray, termListArray, (config.threadNumber - 1) * per, sentenceArray.length);
threadArray[config.threadNumber - 1].start();
try {
for (WorkThread thread : threadArray) {
thread.join();
}
} catch (InterruptedException e) {
// NOTE(review): the interrupt flag is not re-asserted here; consider
// Thread.currentThread().interrupt() before returning.
logger.severe("线程同步异常:" + TextUtility.exceptionToString(e));
return Collections.emptyList();
}
List<Term> termList = new LinkedList<Term>();
// Offsets must be recalibrated because the text was split into sentences.
if (config.offset || config.indexMode) {
int sentenceOffset = 0;
for (int i = 0; i < sentenceArray.length; ++i) {
for (Term term : termListArray[i]) {
term.offset += sentenceOffset;
termList.add(term);
}
sentenceOffset += sentenceArray[i].length();
}
} else {
for (List<Term> list : termListArray) {
termList.addAll(list);
}
}
return termList;
}
// Single-threaded path (also taken for short texts).
return segSentence(charArray);
}
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs:
class TextRankSentence, method convertSentenceListToDocument.
/**
 * Converts a list of sentences into a document: each sentence becomes the list
 * of its notional (non-stop-word) tokens.
 *
 * @param sentenceList the sentences to tokenize
 * @return one word list per input sentence, in the same order
 */
private static List<List<String>> convertSentenceListToDocument(List<String> sentenceList) {
    List<List<String>> docs = new ArrayList<List<String>>(sentenceList.size());
    for (String sentence : sentenceList) {
        List<String> words = new LinkedList<String>();
        // Segment the sentence, then keep only terms the stop-word dictionary allows.
        for (Term term : StandardTokenizer.segment(sentence.toCharArray())) {
            if (!CoreStopWordDictionary.shouldInclude(term)) {
                continue;
            }
            words.add(term.word);
        }
        docs.add(words);
    }
    return docs;
}
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs:
class TraditionalChineseTokenizer, method segSentence.
private static List<Term> segSentence(String text) {
    // Segment the simplified-Chinese conversion of the text, then map each
    // term's surface form back to the original traditional characters.
    String simplified = CharTable.convert(text);
    List<Term> termList = SEGMENT.seg(simplified);
    int cursor = 0;
    for (Term term : termList) {
        term.offset = cursor;
        // Cache the length before touching term.word: reassigning the word
        // would change term.length().
        int length = term.length();
        // Single-character terms are restored straight from the original text;
        // longer terms are looked up in the simplified→traditional dictionary.
        String traditional = length == 1 ? null : SimplifiedChineseDictionary.getTraditionalChinese(term.word);
        if (traditional == null) {
            term.word = text.substring(cursor, cursor + length);
        } else {
            term.word = traditional;
        }
        cursor += length;
    }
    return termList;
}
Aggregations