use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
the class AhoCorasickDoubleArrayTrieSegment method segSentence.
/**
 * Segments a sentence by longest dictionary match using the Aho-Corasick double-array trie.
 * <p>
 * For every offset the longest dictionary hit starting there is recorded; offsets without a
 * hit default to a single-character token. When part-of-speech tagging is enabled, stretches
 * not covered by a dictionary word are tagged via {@code quickAtomSegment}.
 *
 * @param sentence the characters to segment
 * @return the terms in text order, each carrying its offset; an empty list when no dictionary
 *         (trie) has been loaded
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    if (trie == null) {
        logger.warning("还未加载任何词典");
        return Collections.emptyList();
    }
    // wordNet[i] holds the length of the token that starts at offset i (1 by default).
    final int[] wordNet = new int[sentence.length];
    Arrays.fill(wordNet, 1);
    // natureArray[i] holds the tag of the token starting at i; only allocated when tagging is on.
    final Nature[] natureArray = config.speechTagging ? new Nature[sentence.length] : null;
    trie.parseText(sentence, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
        @Override
        public void hit(int begin, int end, CoreDictionary.Attribute value) {
            int length = end - begin;
            // Keep only the longest hit for each start offset.
            if (length > wordNet[begin]) {
                wordNet[begin] = length;
                if (config.speechTagging) {
                    natureArray[begin] = value.nature[0];
                }
            }
        }
    });
    LinkedList<Term> termList = new LinkedList<Term>();
    if (config.speechTagging) {
        for (int i = 0; i < natureArray.length; ) {
            if (natureArray[i] == null) {
                // Find the end of the untagged run [i, j).
                int j = i + 1;
                for (; j < natureArray.length; ++j) {
                    if (natureArray[j] != null)
                        break;
                }
                List<AtomNode> atomNodeList = quickAtomSegment(sentence, i, j);
                for (AtomNode atomNode : atomNodeList) {
                    if (atomNode.sWord.length() >= wordNet[i]) {
                        wordNet[i] = atomNode.sWord.length();
                        natureArray[i] = atomNode.getNature();
                        i += wordNet[i];
                    }
                }
                i = j;
            } else {
                // Skip the whole dictionary word rather than a single character. Stepping by
                // one would re-enter the inside of the word and start an atom run at an offset
                // the token loop below never visits, which could leave the offset right after
                // the word with a stale length of 1 (e.g. splitting a trailing number into
                // single digits).
                i += wordNet[i];
            }
        }
    }
    // Walk the sentence token by token using the recorded lengths.
    for (int i = 0; i < wordNet.length; ) {
        Term term = new Term(new String(sentence, i, wordNet[i]), config.speechTagging ? (natureArray[i] == null ? Nature.nz : natureArray[i]) : null);
        term.offset = i;
        termList.add(term);
        i += wordNet[i];
    }
    return termList;
}
use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
the class DemoCustomNature method main.
/**
 * Demonstrates creating a custom part-of-speech tag ({@code Nature}) at runtime, attaching it
 * to words, and using it in segmentation results.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String sentence = "苹果电脑可以运行开源阿尔法狗代码吗";

    // A tag that already exists in the system can be looked up directly.
    System.out.println(Nature.fromString("n"));

    // According to the original demo, the custom tag does not exist yet at this point.
    System.out.println(Nature.fromString("电脑品牌"));

    // It can be created dynamically.
    final Nature brandNature = Nature.create("电脑品牌");
    System.out.println(brandNature);

    // Assign it to a word, either as a Nature object ...
    LexiconUtility.setAttribute("苹果电脑", brandNature);
    // ... or through a textual attribute description.
    LexiconUtility.setAttribute("苹果电脑", "电脑品牌 1000");

    // The assignment takes effect in segmentation results.
    List<Term> terms = HanLP.segment(sentence);
    System.out.println(terms);
    for (Term t : terms) {
        if (t.nature == brandNature) {
            System.out.printf("找到了 [%s] : %s\n", brandNature, t.word);
        }
    }

    // A word with a new tag can also be inserted straight into the user dictionary.
    CustomDictionary.insert("阿尔法狗", "科技名词 1024");

    // HMM part-of-speech tagging is still supported.
    StandardTokenizer.SEGMENT.enablePartOfSpeechTagging(true);
    terms = HanLP.segment(sentence);
    System.out.println(terms);

    // Once dynamic tags are in use, every class that uses switch(nature) must be registered.
    CustomNatureUtility.registerSwitchClass(DemoCustomNature.class);
    for (Term t : terms) {
        switch (t.nature) {
            case n:
                System.out.printf("找到了 [%s] : %s\n", "名词", t.word);
                break;
            default:
                break;
        }
    }
}
use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
the class WordBasedGenerativeModelSegment method SplitMiddleSlashFromDigitalWords.
//====================================================================
// If the current word is tagged Nature.nx, has the form "<number>-<number>",
// and the next word can be tagged q or n, split the current word at the
// first "-" into three vertices: number, "-", number.
// Example: "3-4 / 月" is split into "3 / - / 4 / 月".
//====================================================================
private static void SplitMiddleSlashFromDigitalWords(List<Vertex> linkedArray) {
    if (linkedArray.size() < 2)
        return;
    ListIterator<Vertex> listIterator = linkedArray.listIterator();
    Vertex next = listIterator.next();
    Vertex current = next;
    while (listIterator.hasNext()) {
        next = listIterator.next();
        Nature currentNature = current.getNature();
        if (currentNature == Nature.nx && (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
            // Limit 2 yields at most two parts, split at the first "-". A limit of 1 never
            // splits at all, which made the length check below permanently false.
            String[] param = current.realWord.split("-", 2);
            if (param.length == 2) {
                if (TextUtility.isAllNum(param[0]) && TextUtility.isAllNum(param[1])) {
                    current = current.copy();
                    current.realWord = param[0];
                    current.confirmNature(Nature.m);
                    // Step back over `next`, then over the original `current`, and replace it.
                    listIterator.previous();
                    listIterator.previous();
                    listIterator.set(current);
                    // Move past the replaced vertex and insert "-" and the second number
                    // between it and `next`.
                    listIterator.next();
                    listIterator.add(Vertex.newPunctuationInstance("-"));
                    listIterator.add(Vertex.newNumberInstance(param[1]));
                    // The cursor now sits just before `next`; consume it again so the
                    // following iteration does not compare `next` with itself.
                    listIterator.next();
                }
            }
        }
        current = next;
    }
}
use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
the class TestCustomDictionary method testCustomNature.
/**
 * Creating a custom nature twice under the same name must yield equal values.
 */
public void testCustomNature() throws Exception {
    final String natureName = "电脑品牌";
    Nature first = Nature.create(natureName);
    Nature second = Nature.create(natureName);
    assertEquals(first, second);
}
use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
the class WordNet method add.
/**
 * Adds one vertex per atom node, converting the result of atom segmentation into vertices.
 *
 * @param line        row at which the first atom's vertex is added; each later atom is added
 *                    at {@code line} plus the combined length of the atoms before it
 * @param atomSegment the atom nodes to add, in text order
 */
public void add(int line, List<AtomNode> atomSegment) {
    int consumed = 0;
    for (AtomNode atom : atomSegment) {
        // Defaults: keep the atom's own text, tag it as Nature.n, with id -1.
        String word = atom.sWord;
        Nature nature = Nature.n;
        int wordId = -1;
        switch (atom.nPOS) {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                // Numbers are replaced by the shared number placeholder.
                nature = Nature.m;
                word = "未##数";
                wordId = CoreDictionary.M_WORD_ID;
                break;
            case Predefine.CT_DELIMITER:
            case Predefine.CT_OTHER:
                nature = Nature.w;
                break;
            case Predefine.CT_SINGLE:
                // e.g. 12021-2129-3121; replaced by the shared string placeholder.
                nature = Nature.nx;
                word = "未##串";
                wordId = CoreDictionary.X_WORD_ID;
                break;
            case Predefine.CT_CHINESE:
            default:
                // Keep the defaults.
                break;
        }
        // NOTE(review): the original comment says these generic tokens are on the order of
        // 100,000, while the value passed here is 10000 — confirm which is intended.
        Vertex vertex = new Vertex(word, atom.sWord, new CoreDictionary.Attribute(nature, 10000), wordId);
        add(line + consumed, vertex);
        consumed += atom.sWord.length();
    }
}
Aggregations