Usage example of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs: class TestSegment, method testIssue193.
/**
 * Regression test for GitHub issue #193: sentences mixing digits, units and
 * organization names must segment without errors when organization and
 * number/quantifier recognition are both enabled.
 */
public void testIssue193() throws Exception {
    String[] sentences = {
        "以每台约200元的价格送到苹果售后维修中心换新机(苹果的保修基本是免费换新机)",
        "可能以2500~2800元的价格回收",
        "3700个益农信息社打通服务“最后一公里”",
        "一位李先生给高政留言说上周五可以帮忙献血",
        "一位浩宁达高层透露",
        "五和万科长阳天地5个普宅项目",
        "以1974点低点和5178点高点作江恩角度线",
        "纳入统计的18家京系基金公司",
        "华夏基金与嘉实基金两家京系基金公司",
        "则应从排名第八的投标人开始依次递补三名投标人"
    };
    Segment segment = HanLP.newSegment()
            .enableOrganizationRecognize(true)
            .enableNumberQuantifierRecognize(true);
    for (String sentence : sentences) {
        // Print each segmentation result for manual inspection.
        System.out.println(segment.seg(sentence));
    }
}
Usage example of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs: class TextRankKeyword, method getRank.
/**
 * Computes the TextRank score of each word from an already-segmented term list.
 * A co-occurrence edge is added between any two distinct words appearing within
 * a sliding window of 5 words, then the PageRank-style iteration is run until
 * convergence ({@code max_diff <= min_diff}) or {@code max_iter} iterations.
 *
 * @param termList the segmented terms of the document
 * @return map from word to its converged rank score
 */
public Map<String, Float> getRank(List<Term> termList) {
    // Keep only the terms that qualify as keyword candidates.
    List<String> wordList = new ArrayList<String>(termList.size());
    for (Term t : termList) {
        if (shouldInclude(t)) {
            wordList.add(t.word);
        }
    }
    // Build the undirected co-occurrence graph: word -> neighbors within the window.
    Map<String, Set<String>> words = new TreeMap<String, Set<String>>();
    Queue<String> que = new LinkedList<String>();
    for (String w : wordList) {
        if (!words.containsKey(w)) {
            words.put(w, new TreeSet<String>());
        }
        // Maintain a sliding window of at most 5 recent words.
        que.offer(w);
        if (que.size() > 5) {
            que.poll();
        }
        // Connect every distinct pair inside the window in both directions.
        for (String w1 : que) {
            for (String w2 : que) {
                if (w1.equals(w2)) {
                    continue;
                }
                words.get(w1).add(w2);
                words.get(w2).add(w1);
            }
        }
    }
    // Iterative rank propagation with damping factor d.
    Map<String, Float> score = new HashMap<String, Float>();
    for (int i = 0; i < max_iter; ++i) {
        Map<String, Float> m = new HashMap<String, Float>();
        float max_diff = 0;
        for (Map.Entry<String, Set<String>> entry : words.entrySet()) {
            String key = entry.getKey();
            // Accumulate in a local instead of repeated m.get/m.put per neighbor,
            // and look each neighbor's score up only once.
            float rank = 1 - d;
            for (String element : entry.getValue()) {
                int size = words.get(element).size();
                if (key.equals(element) || size == 0)
                    continue;
                Float neighborScore = score.get(element);
                rank += d / size * (neighborScore == null ? 0 : neighborScore);
            }
            m.put(key, rank);
            Float previous = score.get(key);
            max_diff = Math.max(max_diff, Math.abs(rank - (previous == null ? 0 : previous)));
        }
        score = m;
        // Converged: the largest per-word change fell below the threshold.
        if (max_diff <= min_diff)
            break;
    }
    return score;
}
Usage example of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs: class DoubleArrayTrieSegment, method segSentence.
/**
 * Segments one sentence with a greedy longest-match over the core
 * double-array trie, optionally merged with the custom dictionary,
 * and falls back to atom segmentation for spans no dictionary word covers.
 */
@Override
protected List<Term> segSentence(char[] sentence) {
char[] charArray = sentence;
// wordNet[i] = length of the chosen word starting at offset i; default 1 (single char).
final int[] wordNet = new int[charArray.length];
Arrays.fill(wordNet, 1);
// natureArray[i] = part of speech of the word starting at i; only allocated when tagging is on.
final Nature[] natureArray = config.speechTagging ? new Nature[charArray.length] : null;
DoubleArrayTrie<CoreDictionary.Attribute>.Searcher<CoreDictionary.Attribute> searcher = CoreDictionary.trie.getSearcher(sentence, 0);
// Core-dictionary pass: keep the longest match found at each begin offset.
while (searcher.next()) {
int length = searcher.length;
if (length > wordNet[searcher.begin]) {
wordNet[searcher.begin] = length;
if (config.speechTagging) {
// Take the first (most frequent) nature listed for the dictionary entry.
natureArray[searcher.begin] = searcher.value.nature[0];
}
}
}
// Custom-dictionary pass: a custom entry wins only if it is longer than the current match.
if (config.useCustomDictionary) {
CustomDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
@Override
public void hit(int begin, int end, CoreDictionary.Attribute value) {
int length = end - begin;
if (length > wordNet[begin]) {
wordNet[begin] = length;
if (config.speechTagging) {
natureArray[begin] = value.nature[0];
}
}
}
});
}
LinkedList<Term> termList = new LinkedList<Term>();
if (config.speechTagging) {
// Fill tagging gaps: every maximal run [i, j) with no nature assigned is
// re-segmented into atoms (digits, letters, single CJK chars, ...).
for (int i = 0; i < natureArray.length; ) {
if (natureArray[i] == null) {
int j = i + 1;
for (; j < natureArray.length; ++j) {
if (natureArray[j] != null)
break;
}
List<AtomNode> atomNodeList = quickAtomSegment(charArray, i, j);
// NOTE(review): i advances inside this loop only when an atom is at least as
// long as the current wordNet entry; atoms failing the check are skipped
// without moving i — presumably intentional, but worth confirming upstream.
for (AtomNode atomNode : atomNodeList) {
if (atomNode.sWord.length() >= wordNet[i]) {
wordNet[i] = atomNode.sWord.length();
natureArray[i] = atomNode.getNature();
i += wordNet[i];
}
}
// Resume scanning after the untagged gap regardless of how far the atoms reached.
i = j;
} else {
++i;
}
}
}
// Materialize terms by jumping word-length steps through wordNet.
for (int i = 0; i < wordNet.length; ) {
// Untagged positions default to Nature.nz; nature is null entirely when tagging is off.
Term term = new Term(new String(charArray, i, wordNet[i]), config.speechTagging ? (natureArray[i] == null ? Nature.nz : natureArray[i]) : null);
term.offset = i;
termList.add(term);
i += wordNet[i];
}
return termList;
}
Usage example of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs: class CoreSynonymDictionaryEx, method convert.
/**
 * Converts a segmentation result into a list of synonym-id arrays,
 * skipping punctuation, digit-written numerals and stop words.
 *
 * @param sentence          the segmented sentence
 * @param withUndefinedItem whether to keep words absent from the dictionary
 *                          (they are given the sentinel id {@code Long.MAX_VALUE / 3})
 * @return one {@code Long[]} of synonym ids per retained term
 */
public static List<Long[]> convert(List<Term> sentence, boolean withUndefinedItem) {
    List<Long[]> synonymItemList = new ArrayList<Long[]>(sentence.size());
    for (Term term : sentence) {
        // Terms without a part of speech are treated as stop words.
        if (term.nature == null)
            continue;
        char posHead = term.nature.toString().charAt(0);
        // Punctuation never carries synonyms.
        if (posHead == 'w')
            continue;
        // Numerals are kept only when written entirely in Chinese characters.
        if (posHead == 'm' && !TextUtility.isAllChinese(term.word))
            continue;
        // Drop stop words.
        if (CoreStopWordDictionary.contains(term.word))
            continue;
        Long[] item = get(term.word);
        if (item != null) {
            synonymItemList.add(item);
        } else if (withUndefinedItem) {
            // Out-of-dictionary word: substitute the sentinel id.
            synonymItemList.add(new Long[] { Long.MAX_VALUE / 3 });
        }
    }
    return synonymItemList;
}
Usage example of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs: class CommonSynonymDictionary, method rewrite.
/**
 * Rewrites the text by replacing each word with a randomly chosen synonym of
 * equal meaning, conditioned on the previous (tagged) word; words without a
 * usable synonym are copied through unchanged.
 */
public String rewrite(String text) {
    List<Term> termList = StandardTokenizer.segment(text.toCharArray());
    // Slightly oversize the buffer since synonyms may be longer than the originals.
    StringBuilder sbOut = new StringBuilder((int) (text.length() * 1.2));
    String preWord = Predefine.TAG_BIGIN;
    for (Term term : termList) {
        SynonymItem synonymItem = get(term.word);
        Synonym synonym = (synonymItem == null)
                ? null
                : synonymItem.randomSynonym(Type.EQUAL, preWord);
        sbOut.append(synonym == null ? term.word : synonym.realWord);
        // Context for the next replacement: the compiled "nature/word" tag.
        // NOTE(review): assumes term.nature is never null here — confirm with the tokenizer config.
        preWord = PosTagCompiler.compile(term.nature.toString(), term.word);
    }
    return sbOut.toString();
}
End of aggregated Term usage examples.