Search in sources :

Example 21 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class URLTokenizer method segment.

/**
 * Segments text while keeping every URL intact as a single term.
 *
 * <p>The text is scanned with the {@code WEB_URL} pattern. Each stretch of
 * text between two matches is handed to {@code SEGMENT}, and each match is
 * emitted as one {@link Term} tagged {@code Nature.xu}.
 *
 * @param text the text to segment
 * @return the resulting terms, in the order they occur in {@code text}
 */
public static List<Term> segment(String text) {
    List<Term> result = new LinkedList<Term>();
    Matcher urlMatcher = WEB_URL.matcher(text);
    // Index of the first character that has not been emitted yet.
    int cursor = 0;
    while (urlMatcher.find()) {
        // Ordinary text sitting in front of the URL that was just found.
        String plainText = text.substring(cursor, urlMatcher.start());
        result.addAll(SEGMENT.seg(plainText));
        // The URL itself becomes exactly one term.
        result.add(new Term(urlMatcher.group(), Nature.xu));
        cursor = urlMatcher.end();
    }
    // Whatever follows the last URL (or the whole text when no URL matched).
    if (cursor < text.length()) {
        String remainder = text.substring(cursor);
        result.addAll(SEGMENT.seg(remainder));
    }
    return result;
}
Also used : Matcher(java.util.regex.Matcher) Term(com.hankcs.hanlp.seg.common.Term) LinkedList(java.util.LinkedList)

Example 22 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class DemoChineseNameRecognition method main.

/**
 * Demo: segments a fixed set of sentences with a segmenter obtained from
 * {@code HanLP.newSegment()} that has {@code enableNameRecognize(true)}
 * applied, printing the term list of each sentence to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Sample sentences, processed in the order listed.
    String[] sentences = {
        "签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。",
        "区长庄木弟新年致辞",
        "朱立伦:两岸都希望共创双赢 习朱历史会晤在即",
        "陕西首富吴一坚被带走 与令计划妻子有交集",
        "据美国之音电台网站4月28日报道,8岁的凯瑟琳·克罗尔(凤甫娟)和很多华裔美国小朋友一样,小小年纪就开始学小提琴了。她的妈妈是位虎妈么?",
        "凯瑟琳和露西(庐瑞媛),跟她们的哥哥们有一些不同。",
        "王国强、高峰、汪洋、张朝阳光着头、韩寒、小四",
        "张浩和胡健康复员回家了",
        "王总和小丽结婚了",
        "编剧邵钧林和稽道青说",
        "这里有关天培的有关事迹",
        "龚学平等领导说,邓颖超生前杜绝超生"
    };
    Segment nameAwareSegment = HanLP.newSegment().enableNameRecognize(true);
    for (int i = 0; i < sentences.length; i++) {
        List<Term> terms = nameAwareSegment.seg(sentences[i]);
        System.out.println(terms);
    }
}
Also used : Term(com.hankcs.hanlp.seg.common.Term) Segment(com.hankcs.hanlp.seg.Segment)

Example 23 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class DemoCustomNature method main.

/**
 * Demo of custom (dynamically created) natures, i.e. part-of-speech tags.
 *
 * <p>Looks up an existing nature, creates a new one at runtime, attaches it
 * to a word, segments a sample sentence, and finally shows a
 * {@code switch} over {@code term.nature}. The statements mutate shared
 * dictionary state, so their order matters.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // A nature that already exists in the system can be fetched directly
    Nature pcNature = Nature.fromString("n");
    System.out.println(pcNature);
    // At this point the system has no nature named "电脑品牌" (computer brand)
    pcNature = Nature.fromString("电脑品牌");
    System.out.println(pcNature);
    // We can add one dynamically
    pcNature = Nature.create("电脑品牌");
    System.out.println(pcNature);
    // It can be assigned to a word
    LexiconUtility.setAttribute("苹果电脑", pcNature);
    // Or, equivalently, by giving the attribute as a string
    // NOTE(review): the trailing number is presumably a frequency — confirm
    LexiconUtility.setAttribute("苹果电脑", "电脑品牌 1000");
    // The assignments take effect in the segmentation result
    List<Term> termList = HanLP.segment("苹果电脑可以运行开源阿尔法狗代码吗");
    System.out.println(termList);
    for (Term term : termList) {
        // Reference comparison against the nature created above
        // NOTE(review): presumably there is one Nature instance per name — confirm
        if (term.nature == pcNature)
            System.out.printf("找到了 [%s] : %s\n", pcNature, term.word);
    }
    // A word can also be inserted straight into the user dictionary
    CustomDictionary.insert("阿尔法狗", "科技名词 1024");
    // HMM part-of-speech tagging is still supported
    StandardTokenizer.SEGMENT.enablePartOfSpeechTagging(true);
    termList = HanLP.segment("苹果电脑可以运行开源阿尔法狗代码吗");
    System.out.println(termList);
    // If any class uses a switch(nature) statement after dynamic natures have
    // been used, every such class must be registered:
    CustomNatureUtility.registerSwitchClass(DemoCustomNature.class);
    for (Term term : termList) {
        // Only the "n" nature is handled; terms with any other nature print nothing
        switch(term.nature) {
            case n:
                System.out.printf("找到了 [%s] : %s\n", "名词", term.word);
        }
    }
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Term(com.hankcs.hanlp.seg.common.Term)

Example 24 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class DemoHMMSegment method main.

/**
 * Demo of {@code HMMSegment}: segments a set of sample sentences, prints the
 * results, then runs a small throughput measurement on one sentence.
 *
 * <p>Elapsed time is taken from {@link System#nanoTime()} rather than
 * {@link System#currentTimeMillis()}: the loop is short, and a millisecond
 * wall clock is both coarse and not guaranteed to be monotonic, which could
 * produce a zero interval and an {@code Infinity} result.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Turn off part-of-speech display in the printed terms
    HanLP.Config.ShowTermNature = false;
    Segment segment = new HMMSegment();
    String[] sentenceArray = new String[] {
        "HanLP是由一系列模型与算法组成的Java工具包,目标是普及自然语言处理在生产环境中的应用。",
        // Shows some ability to recognize technical terms
        "高锰酸钾,强氧化剂,紫红色晶体,可溶于水,遇乙醇即被还原。常用作消毒剂、水净化剂、氧化剂、漂白剂、毒气吸收剂、二氧化碳精制剂等。",
        // Non-news corpus
        "《夜晚的骰子》通过描述浅草的舞女在暗夜中扔骰子的情景,寄托了作者对庶民生活区的情感",
        // Weibo (microblog) text
        "这个像是真的[委屈]前面那个打扮太江户了,一点不上品...@hankcs",
        "鼎泰丰的小笼一点味道也没有...每样都淡淡的...淡淡的,哪有食堂2A的好次",
        "克里斯蒂娜·克罗尔说:不,我不是虎妈。我全家都热爱音乐,我也鼓励他们这么做。",
        "今日APPS:Sago Mini Toolbox培养孩子动手能力",
        "财政部副部长王保安调任国家统计局党组书记",
        "2.34米男子娶1.53米女粉丝 称夫妻生活没问题",
        "你看过穆赫兰道吗",
        "乐视超级手机能否承载贾布斯的生态梦"
    };
    for (String sentence : sentenceArray) {
        List<Term> termList = segment.seg(sentence);
        System.out.println(termList);
    }
    // Rough speed measurement
    String text = "江西鄱阳湖干枯,中国最大淡水湖变成大草原";
    System.out.println(segment.seg(text));
    int pressure = 1000;
    // nanoTime is the monotonic, high-resolution source meant for intervals
    long startNanos = System.nanoTime();
    for (int i = 0; i < pressure; ++i) {
        segment.seg(text);
    }
    double costTime = (System.nanoTime() - startNanos) / 1e9;
    // Characters processed per second
    System.out.printf("HMM2分词速度:%.2f字每秒\n", text.length() * pressure / costTime);
}
Also used : Term(com.hankcs.hanlp.seg.common.Term) Segment(com.hankcs.hanlp.seg.Segment) HMMSegment(com.hankcs.hanlp.seg.HMM.HMMSegment)

Example 25 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class TestLDA method testSegmentCorpus.

/**
 * Segments every file found one level below the corpus root and writes the
 * space-separated words that pass the stop-word filter to an output file
 * named {@code <folder>_<file>}.
 *
 * <p>Both the input root and the output directory are hard-coded absolute
 * Windows paths.
 *
 * @throws IllegalStateException if the corpus root or one of its
 *         sub-directories cannot be listed
 * @throws Exception if reading or writing a file fails
 */
public void testSegmentCorpus() throws Exception {
    File root = new File("D:\\Doc\\语料库\\搜狗文本分类语料库精简版");
    // listFiles() yields null (not an empty array) when the path is not a
    // directory or an I/O error occurs; fail with a clear message instead
    // of a bare NullPointerException in the loop below.
    File[] folders = root.listFiles();
    if (folders == null) {
        throw new IllegalStateException("Corpus root is not a readable directory: " + root.getAbsolutePath());
    }
    for (File folder : folders) {
        if (!folder.isDirectory()) {
            continue;
        }
        File[] files = folder.listFiles();
        if (files == null) {
            throw new IllegalStateException("Cannot list corpus folder: " + folder.getAbsolutePath());
        }
        for (File file : files) {
            System.out.println(file.getAbsolutePath());
            List<Term> termList = HanLP.segment(IOUtil.readTxt(file.getAbsolutePath()));
            StringBuilder sbOut = new StringBuilder();
            for (Term term : termList) {
                // Keep only the terms accepted by the core stop-word dictionary
                if (CoreStopWordDictionary.shouldInclude(term)) {
                    sbOut.append(term.word).append(" ");
                }
            }
            IOUtil.saveTxt("D:\\Doc\\语料库\\segmented\\" + folder.getName() + "_" + file.getName(), sbOut.toString());
        }
    }
}
Also used : Term(com.hankcs.hanlp.seg.common.Term) File(java.io.File)

Aggregations

Term (com.hankcs.hanlp.seg.common.Term) 48, Segment (com.hankcs.hanlp.seg.Segment) 12, DijkstraSegment (com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment) 8, LinkedList (java.util.LinkedList) 7, CRFSegment (com.hankcs.hanlp.seg.CRF.CRFSegment) 5, ResultTerm (com.hankcs.hanlp.seg.common.ResultTerm) 5, Vertex (com.hankcs.hanlp.seg.common.Vertex) 5, CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence) 4, CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) 4, DoubleArrayTrieSegment (com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment) 4, ViterbiSegment (com.hankcs.hanlp.seg.Viterbi.ViterbiSegment) 4, ArrayList (java.util.ArrayList) 4, Nature (com.hankcs.hanlp.corpus.tag.Nature) 3, CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary) 3, AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) 2, Filter (com.hankcs.hanlp.dictionary.stopword.Filter) 2, Table (com.hankcs.hanlp.model.crf.Table) 2, HMMSegment (com.hankcs.hanlp.seg.HMM.HMMSegment) 2, AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode) 2, File (java.io.File) 2