Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
Class CRFSegment, method toVertexList.
private static List<Vertex> toVertexList(List<Term> termList, boolean appendStart) {
    ArrayList<Vertex> vertexList = new ArrayList<Vertex>(termList.size() + 1);
    if (appendStart)
        vertexList.add(Vertex.B); // prepend the sentence-begin vertex used by the Viterbi lattice
    for (Term term : termList) {
        CoreDictionary.Attribute attribute = CoreDictionary.get(term.word);
        if (attribute == null) {
            // Out-of-vocabulary word: blank tokens fall back to nature x,
            // everything else to nz (other proper noun)
            if (term.word.trim().length() == 0)
                attribute = new CoreDictionary.Attribute(Nature.x);
            else
                attribute = new CoreDictionary.Attribute(Nature.nz);
        } else
            term.nature = attribute.nature[0]; // take the most likely nature from the core dictionary
        Vertex vertex = new Vertex(term.word, attribute);
        vertexList.add(vertex);
    }
    return vertexList;
}
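The nz fallback above fires whenever a word misses the core dictionary. A minimal probe of that lookup, assuming HanLP's data files are configured (the class name CoreDictionaryProbe is illustrative only):

import com.hankcs.hanlp.dictionary.CoreDictionary;

public class CoreDictionaryProbe {
    public static void main(String[] args) {
        // "挖掘机" is a common noun; "蓝翔" is a proper noun that is likely
        // out of vocabulary, so toVertexList would give it the nz fallback
        for (String word : new String[] { "挖掘机", "蓝翔" }) {
            CoreDictionary.Attribute attribute = CoreDictionary.get(word);
            System.out.println(word + " -> " + (attribute == null ? "OOV, falls back to nz/x" : attribute));
        }
    }
}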
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
Class CRFSegment, method segSentence.
@Override
protected List<Term> segSentence(char[] sentence) {
    if (sentence.length == 0)
        return Collections.emptyList();
    // Normalize characters (e.g. full-width to half-width) before tagging
    char[] sentenceConverted = CharTable.convert(sentence);
    Table table = new Table();
    table.v = atomSegmentToTable(sentenceConverted);
    crfModel.tag(table);
    List<Term> termList = new LinkedList<Term>();
    if (HanLP.Config.DEBUG) {
        System.out.println("CRF标注结果"); // "CRF tagging result"
        System.out.println(table);
    }
    int offset = 0;
    OUTER:
    for (int i = 0; i < table.v.length; offset += table.v[i][1].length(), ++i) {
        String[] line = table.v[i];
        switch (line[2].charAt(0)) {
            case 'B': {
                int begin = offset;
                // Consume atoms until the matching 'E' tag closes the word
                while (table.v[i][2].charAt(0) != 'E') {
                    offset += table.v[i][1].length();
                    ++i;
                    if (i == table.v.length) {
                        break;
                    }
                }
                if (i == table.v.length) {
                    // The tag sequence ended before an 'E' was found
                    termList.add(new Term(new String(sentence, begin, offset - begin), null));
                    break OUTER;
                } else
                    termList.add(new Term(new String(sentence, begin, offset - begin + table.v[i][1].length()), null));
            }
            break;
            default: {
                // 'S' (single-atom word) or a stray 'M'/'E': emit the atom as its own term
                termList.add(new Term(new String(sentence, offset, table.v[i][1].length()), null));
            }
            break;
        }
    }
    if (config.speechTagging) {
        // Recover part-of-speech tags with a Viterbi pass over the core dictionary's transition matrix
        List<Vertex> vertexList = toVertexList(termList, true);
        Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
        int i = 0;
        for (Term term : termList) {
            if (term.nature != null)
                term.nature = vertexList.get(i + 1).guessNature(); // i + 1 skips the leading begin vertex
            ++i;
        }
    }
    if (config.useCustomDictionary) {
        List<Vertex> vertexList = toVertexList(termList, false);
        combineByCustomDictionary(vertexList);
        termList = toTermList(vertexList, config.offset);
    }
    return termList;
}
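The OUTER loop above is a BMES decoder: 'B' opens a word, the inner while consumes atoms up to the closing 'E', and everything else is emitted as-is. A self-contained sketch of the same merge, with hypothetical atoms/tags arrays standing in for columns 1 and 2 of table.v:

import java.util.ArrayList;
import java.util.List;

public class BmesMergeSketch {
    // atoms[i] plays the role of table.v[i][1], tags[i] of table.v[i][2].charAt(0)
    static List<String> merge(String[] atoms, char[] tags) {
        List<String> words = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        for (int i = 0; i < atoms.length; ++i) {
            current.append(atoms[i]);
            // 'E' closes a multi-atom word, 'S' is a single-atom word;
            // 'B' and 'M' keep accumulating
            if (tags[i] == 'E' || tags[i] == 'S') {
                words.add(current.toString());
                current.setLength(0);
            }
        }
        if (current.length() > 0)
            words.add(current.toString()); // tag sequence ended without an 'E'
        return words;
    }

    public static void main(String[] args) {
        // Expected output: [商品, 和, 服务]
        System.out.println(merge(new String[] { "商", "品", "和", "服", "务" },
                new char[] { 'B', 'E', 'S', 'B', 'E' }));
    }
}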
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
Class DemoPlaceRecognition, method main.
public static void main(String[] args) {
    String[] testCase = new String[] { "蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机" };
    // Enable place name recognition on a fresh Segment
    Segment segment = HanLP.newSegment().enablePlaceRecognize(true);
    for (String sentence : testCase) {
        List<Term> termList = segment.seg(sentence);
        System.out.println(termList);
    }
}
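The enable* setters return the Segment itself, so several recognizers can be stacked in one chain. A brief sketch, assuming enableOrganizationRecognize from the same Segment API:

Segment segment = HanLP.newSegment()
        .enablePlaceRecognize(true)
        .enableOrganizationRecognize(true); // assumed companion setter on Segment
System.out.println(segment.seg("蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"));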
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
Class DemoStopWord, method main.
public static void main(String[] args) {
    String text = "小区居民有的反对喂养流浪猫,而有的居民却赞成喂养这些小宝贝";
    // The stop word dictionary can be modified at runtime
    CoreStopWordDictionary.add("居民");
    System.out.println(NotionalTokenizer.segment(text));
    CoreStopWordDictionary.remove("居民");
    System.out.println(NotionalTokenizer.segment(text));
    // The filter can also be applied to the output of any tokenizer
    List<Term> termList = BasicTokenizer.segment(text);
    System.out.println(termList);
    CoreStopWordDictionary.apply(termList);
    System.out.println(termList);
    // Custom filtering logic is supported as well
    CoreStopWordDictionary.FILTER = new Filter() {
        @Override
        public boolean shouldInclude(Term term) {
            // Keep only nz (other proper noun) terms that are not stop words; drop everything else
            switch (term.nature) {
                case nz:
                    return !CoreStopWordDictionary.contains(term.word);
            }
            return false;
        }
    };
    System.out.println(NotionalTokenizer.segment(text));
}
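Note that FILTER is a global static field, so the assignment above affects every later NotionalTokenizer call. A defensive sketch that swaps in a hypothetical stop-word-only filter and then restores the previous one:

Filter previous = CoreStopWordDictionary.FILTER; // remember the current global filter
CoreStopWordDictionary.FILTER = new Filter() {
    @Override
    public boolean shouldInclude(Term term) {
        // Hypothetical variant: keep every nature, drop only listed stop words
        return !CoreStopWordDictionary.contains(term.word);
    }
};
try {
    System.out.println(NotionalTokenizer.segment("小区居民有的反对喂养流浪猫"));
} finally {
    CoreStopWordDictionary.FILTER = previous; // undo the global change
}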
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
Class DemoTranslatedNameRecognition, method main.
public static void main(String[] args) {
    String[] testCase = new String[] { "一桶冰水当头倒下,微软的比尔盖茨、Facebook的扎克伯格跟桑德博格、亚马逊的贝索斯、苹果的库克全都不惜湿身入镜,这些硅谷的科技人,飞蛾扑火似地牺牲演出,其实全为了慈善。", "世界上最长的姓名是简森·乔伊·亚历山大·比基·卡利斯勒·达夫·埃利奥特·福克斯·伊维鲁莫·马尔尼·梅尔斯·帕特森·汤普森·华莱士·普雷斯顿。" };
    // Enable recognition of transliterated foreign person names
    Segment segment = HanLP.newSegment().enableTranslatedNameRecognize(true);
    for (String sentence : testCase) {
        List<Term> termList = segment.seg(sentence);
        System.out.println(termList);
    }
}
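To pull out only the recognized names, the resulting terms can be filtered by nature; the sketch below assumes HanLP tags transliterated person names with nature nrf and that speech tagging is present in the output:

Segment segment = HanLP.newSegment().enableTranslatedNameRecognize(true);
for (Term term : segment.seg("微软的比尔盖茨、Facebook的扎克伯格跟桑德博格全都不惜湿身入镜")) {
    // nrf is assumed to be the nature for transliterated person names
    if (term.nature == Nature.nrf)
        System.out.println(term.word);
}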