Search in sources :

Example 1 with WordCountBatchOp

use of com.alibaba.alink.operator.batch.nlp.WordCountBatchOp in project Alink by alibaba.

the class Chap21 method c_3.

/**
 * Segments the {@code news_title} column of the first ten source rows and
 * registers two lazy printouts over the segmented titles: one from
 * {@link WordCountBatchOp} (passed through {@code orderBy("cnt", 100, false)})
 * and one from {@link DocWordCountBatchOp}. Both printouts are produced by the
 * single {@code BatchOperator.execute()} call at the end.
 *
 * @throws Exception if building or executing the batch job fails
 */
static void c_3() throws Exception {
    BatchOperator.setParallelism(1);

    // Keep only the title column, then segment it into "segmented_title";
    // the empty reserved-column list is passed exactly as before.
    BatchOperator<?> newsTitles = getSource()
        .firstN(10)
        .select("news_title");
    SegmentBatchOp titleSegmenter = new SegmentBatchOp()
        .setSelectedCol("news_title")
        .setOutputCol("segmented_title")
        .setReservedCols(new String[] {});
    BatchOperator<?> titles = newsTitles.link(titleSegmenter);

    // Word counts over the segmented titles.
    WordCountBatchOp wordCounter = new WordCountBatchOp()
        .setSelectedCol("segmented_title");
    titles
        .link(wordCounter)
        .orderBy("cnt", 100, false)
        .lazyPrint(-1, "WordCount");

    // Per-document word counts; the segmented title serves as both the
    // document id column and the content column.
    DocWordCountBatchOp docWordCounter = new DocWordCountBatchOp()
        .setDocIdCol("segmented_title")
        .setContentCol("segmented_title");
    titles
        .link(docWordCounter)
        .lazyPrint(-1, "DocWordCount");

    BatchOperator.execute();
}
Also used : WordCountBatchOp(com.alibaba.alink.operator.batch.nlp.WordCountBatchOp) DocWordCountBatchOp(com.alibaba.alink.operator.batch.nlp.DocWordCountBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator)

Example 2 with WordCountBatchOp

use of com.alibaba.alink.operator.batch.nlp.WordCountBatchOp in project Alink by alibaba.

the class Chap22 method c_2.

/**
 * Runs a word-count and Word2Vec walkthrough over the text file at
 * {@code DATA_DIR + ORIGIN_FILE}.
 *
 * <ol>
 *   <li>Segments the {@code text} column with a user-defined dictionary of
 *       character names, removes stop words, counts the words and prints the
 *       result of {@code orderBy("cnt", 100, false)}.</li>
 *   <li>If no file exists at {@code DATA_DIR + W2V_MODEL_FILE}, trains a
 *       Word2Vec model on the segmented text (with an additional custom
 *       stop-word list) and writes it to that path.</li>
 *   <li>Loads the saved model, fits a {@link VectorNearestNeighbor} on it
 *       (configured with {@code setTopN(20)}) and prints {@code word} and
 *       {@code similar_words} for a fixed set of query words.</li>
 * </ol>
 *
 * @throws Exception if building or executing any of the batch jobs fails
 */
static void c_2() throws Exception {
    BatchOperator.setParallelism(1);

    final String modelPath = DATA_DIR + W2V_MODEL_FILE;

    TextSourceBatchOp source = new TextSourceBatchOp().setFilePath(DATA_DIR + ORIGIN_FILE);
    source.lazyPrint(8);

    // Names passed to the segmenter as a user-defined dictionary.
    final String[] characterDict = new String[] {
        "曹操", "孔明", "玄德", "刘玄德", "刘备", "关羽", "张飞", "赵云", "曹孟德", "诸葛亮", "张郃", "孙权", "张辽", "鲁肃"
    };

    // Stage 1: segment -> remove stop words -> count words, then print.
    SegmentBatchOp countSegmenter = new SegmentBatchOp()
        .setSelectedCol("text")
        .setUserDefinedDict(characterDict);
    StopWordsRemoverBatchOp countStopWordsRemover = new StopWordsRemoverBatchOp()
        .setSelectedCol("text");
    WordCountBatchOp wordCounter = new WordCountBatchOp()
        .setSelectedCol("text");
    source
        .link(countSegmenter)
        .link(countStopWordsRemover)
        .link(wordCounter)
        .orderBy("cnt", 100, false)
        .print();

    // Stage 2: train and persist the Word2Vec model only when it is not on disk yet.
    File modelFile = new File(modelPath);
    if (!modelFile.exists()) {
        final String[] extraStopWords = new String[] {
            "亦", "曰", "遂", "吾", "已", "去", "二人", "今", "使", "中", "知", "不", "见", "都", "令", "却", "欲", "请", "人", "谓",
            "不可", "闻", "前", "后", "皆", "便", "问", "日", "时", "耳", "不敢", "问", "回", "才", "之事", "之人", "之时", "料",
            "今日", "令人", "受", "说", "出", "已毕", "不得", "使人", "众", "何不", "不知", "再", "处", "无", "即日", "诸", "此时",
            "只", "下", "还", "上", "杀", "将军", "却说", "兵", "汝", "走", "言", "寨", "不能", "斩", "死", "商议", "听", "军士",
            "军", "左右", "军马", "引兵", "次日", "二", "看", "耶", "退", "更", "毕", "正", "一人", "原来", "大笑", "车胄", "口",
            "引", "大喜", "其事", "助", "事", "未", "大", "至此", "讫", "心中", "敢"
        };

        SegmentBatchOp trainSegmenter = new SegmentBatchOp()
            .setSelectedCol("text")
            .setUserDefinedDict(characterDict);
        StopWordsRemoverBatchOp trainStopWordsRemover = new StopWordsRemoverBatchOp()
            .setSelectedCol("text")
            .setStopWords(extraStopWords);
        Word2VecTrainBatchOp word2VecTrainer = new Word2VecTrainBatchOp()
            .setSelectedCol("text")
            .setMinCount(10)
            .setNumIter(50);
        AkSinkBatchOp modelSink = new AkSinkBatchOp().setFilePath(modelPath);

        source
            .link(trainSegmenter)
            .link(trainStopWordsRemover)
            .link(word2VecTrainer)
            .link(modelSink);
        BatchOperator.execute();
    }

    // Stage 3: look up the nearest neighbours of selected words in the saved model.
    AkSourceBatchOp word2vec = new AkSourceBatchOp().setFilePath(modelPath);
    final String queryWordFilter = "word IN ('曹操', '操', '玄德', '刘备', '孔明', "
        + "'亮', '卧龙', '周瑜', '吕布', '貂蝉', '云长', '孙权')";

    new VectorNearestNeighbor()
        .setIdCol("word")
        .setSelectedCol("vec")
        .setTopN(20)
        .setOutputCol("similar_words")
        .fit(word2vec)
        .transform(word2vec.filter(queryWordFilter))
        .select("word, similar_words")
        .print();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) WordCountBatchOp(com.alibaba.alink.operator.batch.nlp.WordCountBatchOp) Word2VecTrainBatchOp(com.alibaba.alink.operator.batch.nlp.Word2VecTrainBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) VectorNearestNeighbor(com.alibaba.alink.pipeline.similarity.VectorNearestNeighbor) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) TextSourceBatchOp(com.alibaba.alink.operator.batch.source.TextSourceBatchOp) File(java.io.File)

Aggregations

SegmentBatchOp (com.alibaba.alink.operator.batch.nlp.SegmentBatchOp): 2; WordCountBatchOp (com.alibaba.alink.operator.batch.nlp.WordCountBatchOp): 2; BatchOperator (com.alibaba.alink.operator.batch.BatchOperator): 1; DocWordCountBatchOp (com.alibaba.alink.operator.batch.nlp.DocWordCountBatchOp): 1; StopWordsRemoverBatchOp (com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp): 1; Word2VecTrainBatchOp (com.alibaba.alink.operator.batch.nlp.Word2VecTrainBatchOp): 1; AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp): 1; AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp): 1; TextSourceBatchOp (com.alibaba.alink.operator.batch.source.TextSourceBatchOp): 1; VectorNearestNeighbor (com.alibaba.alink.pipeline.similarity.VectorNearestNeighbor): 1; File (java.io.File): 1