Search in sources:

Example 1 with LearnTool

Use of org.ansj.dic.LearnTool in the project ansj_seg by NLPchina.

The class NewWordFindDemo, method main.

public static void main(String[] args) throws IOException {
    // Demo: learn new words from an entire document, then dump every
    // learned (word, weight) pair to a result file.
    LearnTool learn = new LearnTool();
    // FIX: try-with-resources — the original never closed the reader,
    // leaking the file handle (and always leaking it on exceptions).
    try (BufferedReader reader = IOUtil.getReader("/Users/ansj/Downloads/三国演义.txt", "GBK")) {
        NlpAnalysis nlpAnalysis = new NlpAnalysis(reader).setLearnTool(learn);
        // Drain the token stream; learning happens as a side effect of next().
        while (nlpAnalysis.next() != null) {
            // intentionally empty — only the learning side effect is needed
        }
    }
    // getTopTree(0) returns all learned entries (0 = no limit).
    List<Entry<String, Double>> topTree = learn.getTopTree(0);
    StringBuilder sb = new StringBuilder();
    for (Entry<String, Double> entry : topTree) {
        // FIX: chained appends instead of string concatenation inside append()
        sb.append(entry.getKey()).append('\t').append(entry.getValue()).append('\n');
    }
    IOUtil.Writer("/Users/ansj/Desktop/result.txt", IOUtil.UTF8, sb.toString());
}
Also used : Entry(java.util.Map.Entry) BufferedReader(java.io.BufferedReader) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) LearnTool(org.ansj.dic.LearnTool)

Example 2 with LearnTool

Use of org.ansj.dic.LearnTool in the project ansj_seg by NLPchina.

The class NlpDemo, method main.

public static void main(String[] args) throws IOException {
    // Demo: run nlp-mode segmentation over sample texts while a shared
    // LearnTool accumulates newly discovered words across calls.
    List<String> value = new ArrayList<String>();
    // (disabled sample inputs below are kept as-is)
    // value.add("《屌丝男士》是搜狐视频自制节目《大鹏嘚吧嘚》除“大鹏剧场秀“之外的第二个衍生品牌,是独立于《大鹏嘚吧嘚》每周播出的迷你剧集,第一季于2012年10月10日首播,每周三更新.该片由赵本山第53位弟子,网络第一主持人大鹏(董成鹏)导演并主演,是一部向德国电视剧《屌丝女士》致敬的喜剧短片.大鹏在片中饰演现实生活中的各种男性,而大鹏的各位明星好友,也在片中有惊喜表演.第一季客串明星:柳岩,李响,刘心,何云伟,李菁,如花李健仁,李亚红,乔衫,修睿,赵铭,于莎莎,司马南,不加V,沈腾等");
    // value.add("二次元乳量大不一定是王道");
    value.add("洁面仪配合洁面深层清洁毛孔 清洁鼻孔面膜碎觉使劲挤才能出一点点皱纹 脸颊毛孔修复的看不见啦 草莓鼻历史遗留问题没辙 脸和脖子差不多颜色的皮肤才是健康的 长期使用安全健康的比同龄人显小五到十岁 28岁的妹子看看你们的鱼尾纹");
    // value.add("搜索日志是理解互联网用户信息的宝贵资源。本文基于搜索日志的特点,提出一种双层识别模型方法识别计算机领域查询串。第一层模型采用贝叶斯模型基于领域词库对查询串进行识别,可以达到较高的准确率,由于日志中查询串长度有限,信息量不足等特点导致一些查询串无法召回;针对如上情况我们提出补充信息维度,即在此基础上对其进行第二层模型训练,主要方法是依据查询串点击的URL信息进行可信度训练,依据查询串召回的URL信息进行行业可信度训练计算,以达到召回了更多计算机领域查询串的目的。实验结果表明,双层模型识别后结果不但在准确率上得到保障,并比第一层模型的召回率提高了20个百分点,达到了78%的召回率和96%的精准率。此方法迅速而准确的识别出计算机类别查询串,对其他领域查询识别及查询意图分类具有借鉴意义。");
    // value.add("贾瑞听了,魂不附体,只说:“好侄儿,只说没有见我,明日我重重的谢你。”贾蔷道:“你若谢我,放你不值什么,只不知你谢我多少?况且口说无凭,写一文契来。”贾瑞道:“这如何落纸呢?\"贾蔷道:“这也不妨,写一个赌钱输了外人帐目,借头家银若干两便罢。”贾瑞道:“这也容易.只是此时无纸笔。”贾蔷道:“这也容易。”说罢翻身出来,纸笔现成,拿来命贾瑞写.他两作好作歹,只写了五十两,然后画了押,贾蔷收起来.然后撕逻贾蓉.贾蓉先咬定牙不依,只说:“明日告诉族中的人评评理。”贾瑞急的至于叩头.贾蔷作好作歹的,也写了一张五十两欠契才罢.贾蔷又道:“如今要放你,我就担着不是.老太太那边的门早已关了,老爷正在厅上看南京的东西,那一条路定难过去,如今只好走后门.若这一走,倘或遇见了人,连我也完了.等我们先去哨探哨探,再来领你.这屋你还藏不得,少时就来堆东西.等我寻个地方。”说毕,拉着贾瑞,仍熄了灯,出至院外,摸着大台矶底下,说道:“这窝儿里好,你只蹲着,别哼一声,等我们来再动。”说毕,二人去了");
    // value.add("接了个小私活,帮一个初中高中连读的中学做一个学生日常考评系统,就是记录迟到、早退、违纪什么的一个系统,由班主任管理记录,还要有什么表扬榜的。对于报价不了解,不知道该报多少,大家说说看,多少合适?");
    // value.add("若雅虎关闭了,我就不访问网站了!");
    // The learning machine is stateful.
    long start = System.currentTimeMillis();
    // A single LearnTool instance can be shared; it keeps learning new
    // words as more text is fed in.
    LearnTool learn = new LearnTool();
    // NOTE(review): the original comment said "disable person-name
    // recognition", yet the flag is assigned true — confirm whether
    // true turns Asian-name handling off or on in this LearnTool version.
    learn.isAsianName = true;
    // NOTE(review): same mismatch here — original comment said
    // "disable foreign-name recognition" while assigning true. Confirm.
    learn.isForeignName = true;
    for (String string : value) {
        Result parse = new NlpAnalysis().setLearnTool(learn).parseStr(string);
        System.out.println(parse);
    }
    System.out.println("这次训练已经学到了: " + learn.count + " 个词!");
    System.out.println(System.currentTimeMillis() - start);
    System.out.println(learn.getTopTree(100));
}
Also used : ArrayList(java.util.ArrayList) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) LearnTool(org.ansj.dic.LearnTool) Result(org.ansj.domain.Result)

Example 3 with LearnTool

Use of org.ansj.dic.LearnTool in the project ansj_seg by NLPchina.

The class NlpAnalysis, method getResult.

@Override
// NLP-mode result assembly. Walks the word graph, folds in CRF
// segmentation hints (when a model is loaded), runs the person-name,
// number, user-dictionary and new-word recognizers, then feeds every
// surviving term back into the LearnTool so learning persists across calls.
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            // Lazily create the shared learning tool on first use.
            if (learn == null) {
                learn = new LearnTool();
            }
            graph.walkPath();
            learn.learn(graph, splitWord, forests);
            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            if (splitWord != null) {
                MapCount<String> mc = new MapCount<String>();
                // Segment the sentence with the CRF model
                List<String> words = splitWord.cut(graph.chars);
                Term tempTerm = null;
                int tempOff = 0;
                // Seed the bigram counts with the sentence-start marker ("始##始").
                if (words.size() > 0) {
                    String word = words.get(0);
                    if (!isRuleWord(word)) {
                        mc.add("始##始" + TAB + word, CRF_WEIGHT);
                    }
                }
                for (String word : words) {
                    // Try to obtain the part of speech from the dictionaries
                    TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word);
                    Term term = null;
                    if (termNatures != TermNatures.NULL) {
                        term = new Term(word, tempOff, termNatures);
                    } else {
                        // Not in any dictionary: mark as a candidate new word.
                        term = new Term(word, tempOff, TermNatures.NW);
                        term.setNewWord(true);
                    }
                    // Advance the character offset
                    tempOff += word.length();
                    if (isRuleWord(word)) {
                        // Rule words are dropped, and the pending bigram is reset
                        tempTerm = null;
                        continue;
                    }
                    if (term.isNewWord()) {
                        // Try to guess the part of speech for the new word
                        termNatures = NatureRecognition.guessNature(word);
                        term.updateTermNaturesAndNature(termNatures);
                    }
                    TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
                    // Stay conservative about words outside the dictionary:
                    // only count the bigram when neither side is a new word.
                    if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
                        mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
                    }
                    tempTerm = term;
                    if (term.isNewWord()) {
                        learn.addTerm(new NewWord(word, Nature.NW));
                    }
                }
                // Close the chain with the sentence-end marker ("末##末").
                if (tempTerm != null && !tempTerm.isNewWord()) {
                    mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
                }
                graph.walkPath(mc.get());
            } else {
                LOG.warn("not find any crf model, make sure your config right? ");
            }
            // Number recognition
            if (graph.hasNum && isNumRecognition) {
                new NumRecognition().recognition(graph.terms);
            }
            // Collect the current best-path terms
            // (original comment said "POS tagging" — the helper below only
            // gathers terms; NOTE(review): confirm where tagging happens).
            List<Term> result = getResult();
            // Recognize user-defined dictionary words
            new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
            // New-word discovery
            new NewWordRecognition(learn).recognition(graph.terms);
            graph.walkPathByScore();
            // Re-fetch the optimal path after the optimizations above
            result = getResult();
            // Activate the dictionary entries for every term kept in the result
            for (Term term : result) {
                learn.active(term.getName());
            }
            setRealName(graph, result);
            return result;
        }

        // Collects all non-null terms on the current best path. Skips the
        // last slot (length - 1) — presumably an end-of-sentence sentinel;
        // NOTE(review): confirm against Graph's term-array layout.
        private List<Term> getResult() {
            List<Term> result = new ArrayList<Term>();
            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                if (graph.terms[i] == null) {
                    continue;
                }
                result.add(graph.terms[i]);
            }
            return result;
        }
    };
    return merger.merger();
}
Also used : TermNatures(org.ansj.domain.TermNatures) ArrayList(java.util.ArrayList) MapCount(org.nlpcn.commons.lang.util.MapCount) NewWordRecognition(org.ansj.recognition.arrimpl.NewWordRecognition) Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) UserDefineRecognition(org.ansj.recognition.arrimpl.UserDefineRecognition) NatureRecognition(org.ansj.recognition.impl.NatureRecognition) LearnTool(org.ansj.dic.LearnTool) NewWord(org.ansj.domain.NewWord)

Example 4 with LearnTool

Use of org.ansj.dic.LearnTool in the project ansj_seg by NLPchina.

The class LearnToolDemo, method main.

public static void main(String[] args) throws FileNotFoundException, IOException, ClassNotFoundException {
    // Build a new-word learning helper. It records every new word seen
    // during segmentation; the more often a word occurs, the larger its
    // relative weight becomes.
    LearnTool learnTool = new LearnTool();
    NlpAnalysis nlpAnalysis = new NlpAnalysis().setLearnTool(learnTool);
    // Segment several documents in nlp mode; learning spans all of them.
    String[] docs = {
        "说过,社交软件也是打着沟通的平台,让无数寂寞男女有了肉体与精神的寄托。",
        "其实可以打着这个需求点去运作的互联网公司不应只是社交类软件与可穿戴设备,还有携程网,去哪儿网等等,订房订酒店多好的寓意",
        "张艺谋的卡宴,马明哲的戏"
    };
    for (String doc : docs) {
        nlpAnalysis.parseStr(doc);
    }
    // Top-n learned new words (first 10; passing 0 returns everything).
    System.out.println(learnTool.getTopTree(10));
    // Only the new words whose nature is Nature.NR.
    System.out.println(learnTool.getTopTree(10, Nature.NR));
    /*
     * Serialize the training result to disk.
     */
    StringBuilder snapshot = new StringBuilder();
    for (Entry<String, Double> entry : learnTool.getTopTree(0)) {
        snapshot.append(entry.getKey()).append('\t').append(entry.getValue()).append('\n');
    }
    IOUtil.Writer("learnTool.snap", IOUtil.UTF8, snapshot.toString());
    /*
     * Reload the training result into a fresh LearnTool.
     */
    learnTool = new LearnTool();
    HashMap<String, Double> loadMap = IOUtil.loadMap("learnTool.snap", IOUtil.UTF8, String.class, Double.class);
    for (Entry<String, Double> entry : loadMap.entrySet()) {
        learnTool.addTerm(new NewWord(entry.getKey(), Nature.NW, entry.getValue()));
        learnTool.active(entry.getKey());
    }
    System.out.println(learnTool.getTopTree(10));
    new File("learnTool.snap").delete();
}
Also used : Entry(java.util.Map.Entry) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) File(java.io.File) LearnTool(org.ansj.dic.LearnTool) NewWord(org.ansj.domain.NewWord)

Aggregations

LearnTool (org.ansj.dic.LearnTool)4 NlpAnalysis (org.ansj.splitWord.analysis.NlpAnalysis)3 ArrayList (java.util.ArrayList)2 Entry (java.util.Map.Entry)2 NewWord (org.ansj.domain.NewWord)2 BufferedReader (java.io.BufferedReader)1 File (java.io.File)1 Result (org.ansj.domain.Result)1 Term (org.ansj.domain.Term)1 TermNatures (org.ansj.domain.TermNatures)1 AsianPersonRecognition (org.ansj.recognition.arrimpl.AsianPersonRecognition)1 ForeignPersonRecognition (org.ansj.recognition.arrimpl.ForeignPersonRecognition)1 NewWordRecognition (org.ansj.recognition.arrimpl.NewWordRecognition)1 NumRecognition (org.ansj.recognition.arrimpl.NumRecognition)1 UserDefineRecognition (org.ansj.recognition.arrimpl.UserDefineRecognition)1 NatureRecognition (org.ansj.recognition.impl.NatureRecognition)1 MapCount (org.nlpcn.commons.lang.util.MapCount)1