Use of com.alibaba.alink.operator.batch.nlp.WordCountBatchOp in the Alink project by Alibaba.
From class Chap21, method c_3.
/**
 * Segments the {@code news_title} column of the first 10 source rows and
 * registers two lazy reports on the segmented text: a word count and a
 * per-document word count.
 *
 * <p>Both reports are registered with {@code lazyPrint}; the single
 * {@code BatchOperator.execute()} call at the end is what runs the job.
 *
 * @throws Exception propagated from the Alink operators or from job execution
 */
static void c_3() throws Exception {
    BatchOperator.setParallelism(1);

    // Keep only the title column, then write the segmented text to
    // "segmented_title" with an empty reserved-column list.
    BatchOperator titles = getSource()
        .firstN(10)
        .select("news_title")
        .link(
            new SegmentBatchOp()
                .setSelectedCol("news_title")
                .setOutputCol("segmented_title")
                .setReservedCols(new String[] {})
        );

    // Word count over the segmented titles, ordered on "cnt".
    // NOTE(review): 100 / false are presumably the row limit and the
    // ascending flag of orderBy -- confirm against BatchOperator.orderBy.
    WordCountBatchOp wordCounter = new WordCountBatchOp()
        .setSelectedCol("segmented_title");
    titles
        .link(wordCounter)
        .orderBy("cnt", 100, false)
        .lazyPrint(-1, "WordCount");

    // The segmented title column is used both as the document id and as the content.
    DocWordCountBatchOp docWordCounter = new DocWordCountBatchOp()
        .setDocIdCol("segmented_title")
        .setContentCol("segmented_title");
    titles
        .link(docWordCounter)
        .lazyPrint(-1, "DocWordCount");

    BatchOperator.execute();
}
Use of com.alibaba.alink.operator.batch.nlp.WordCountBatchOp in the Alink project by Alibaba.
From class Chap22, method c_2.
/**
 * Runs a word-count report over a text file, trains and saves a Word2Vec
 * model when none has been saved yet, and prints the nearest neighbours
 * of a fixed set of words in the saved model.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Read {@code DATA_DIR + ORIGIN_FILE} and register a lazy print of the source.</li>
 *   <li>Segment the {@code text} column with a user-defined dictionary, remove
 *       stop words, count words, order on {@code cnt} and print.</li>
 *   <li>If {@code DATA_DIR + W2V_MODEL_FILE} does not exist, segment again,
 *       remove stop words with an extra stop-word list, train Word2Vec
 *       (min count 10, 50 iterations) and write the model to that path.</li>
 *   <li>Load the model and print the top-20 {@code similar_words} for the
 *       selected query words.</li>
 * </ol>
 *
 * @throws Exception propagated from the Alink operators or from job execution
 */
static void c_2() throws Exception {
    BatchOperator.setParallelism(1);

    TextSourceBatchOp source = new TextSourceBatchOp().setFilePath(DATA_DIR + ORIGIN_FILE);
    source.lazyPrint(8);

    // Entries handed to the segmenter as a user-defined dictionary.
    final String[] characterNames = new String[] {
        "曹操", "孔明", "玄德", "刘玄德", "刘备", "关羽", "张飞",
        "赵云", "曹孟德", "诸葛亮", "张郃", "孙权", "张辽", "鲁肃"
    };

    // Report pipeline: segment -> remove stop words -> word count -> print.
    SegmentBatchOp segmentedText = source.link(
        new SegmentBatchOp().setSelectedCol("text").setUserDefinedDict(characterNames));
    StopWordsRemoverBatchOp cleanedText = segmentedText.link(
        new StopWordsRemoverBatchOp().setSelectedCol("text"));
    WordCountBatchOp wordCounts = cleanedText.link(
        new WordCountBatchOp().setSelectedCol("text"));
    wordCounts.orderBy("cnt", 100, false).print();

    final String modelPath = DATA_DIR + W2V_MODEL_FILE;
    File modelFile = new File(modelPath);
    if (!modelFile.exists()) {
        // Extra stop words supplied for the training pipeline only.
        final String[] trainingStopWords = new String[] {
            "亦", "曰", "遂", "吾", "已", "去", "二人", "今", "使", "中",
            "知", "不", "见", "都", "令", "却", "欲", "请", "人", "谓",
            "不可", "闻", "前", "后", "皆", "便", "问", "日", "时", "耳",
            "不敢", "问", "回", "才", "之事", "之人", "之时", "料", "今日", "令人",
            "受", "说", "出", "已毕", "不得", "使人", "众", "何不", "不知", "再",
            "处", "无", "即日", "诸", "此时", "只", "下", "还", "上", "杀",
            "将军", "却说", "兵", "汝", "走", "言", "寨", "不能", "斩", "死",
            "商议", "听", "军士", "军", "左右", "军马", "引兵", "次日", "二", "看",
            "耶", "退", "更", "毕", "正", "一人", "原来", "大笑", "车胄", "口",
            "引", "大喜", "其事", "助", "事", "未", "大", "至此", "讫", "心中",
            "敢"
        };

        // Training pipeline uses its own operator instances rather than
        // reusing the ones from the report pipeline.
        SegmentBatchOp trainingSegments = source.link(
            new SegmentBatchOp().setSelectedCol("text").setUserDefinedDict(characterNames));
        StopWordsRemoverBatchOp trainingTokens = trainingSegments.link(
            new StopWordsRemoverBatchOp().setSelectedCol("text").setStopWords(trainingStopWords));
        Word2VecTrainBatchOp trainedModel = trainingTokens.link(
            new Word2VecTrainBatchOp().setSelectedCol("text").setMinCount(10).setNumIter(50));
        trainedModel.link(new AkSinkBatchOp().setFilePath(modelPath));

        // The sink has no print call, so the job is run explicitly.
        BatchOperator.execute();
    }

    AkSourceBatchOp word2vec = new AkSourceBatchOp().setFilePath(modelPath);

    // Rows of the model whose neighbours should be listed.
    final String queryFilter = "word IN ('曹操', '操', '玄德', '刘备', '孔明', "
        + "'亮', '卧龙', '周瑜', '吕布', '貂蝉', '云长', '孙权')";

    new VectorNearestNeighbor()
        .setIdCol("word")
        .setSelectedCol("vec")
        .setTopN(20)
        .setOutputCol("similar_words")
        .fit(word2vec)
        .transform(word2vec.filter(queryFilter))
        .select("word, similar_words")
        .print();
}
Aggregations