Examples with DocHashCountVectorizer - com.alibaba.alink.pipeline.nlp.DocHashCountVectorizer

Example 1 with DocHashCountVectorizer

Use of com.alibaba.alink.pipeline.nlp.DocHashCountVectorizer in the project Alink by Alibaba.

From the class Chap23, method c_2.

/**
 * Trains and evaluates the same tokenizer + logistic-regression pipeline twice, once with
 * {@code DocCountVectorizer} and once with {@code DocHashCountVectorizer} as the vectorizing
 * stage, and lazily prints binary-classification metrics for each run.
 *
 * <p>If the file {@code DATA_DIR + TRAIN_FILE} does not exist yet, the labeled raw files under
 * {@code ORIGIN_DATA_DIR} are first read and written out as the train and test data sets.
 *
 * @throws IllegalStateException if one of the raw label folders cannot be listed
 * @throws Exception if reading the raw files or running the batch job fails
 */
static void c_2() throws Exception {
    // NOTE(review): only the train file is checked; a missing test file alone does not
    // trigger regeneration. Kept as in the original.
    if (!new File(DATA_DIR + TRAIN_FILE).exists()) {
        ArrayList<Row> trainRows = loadLabeledRows("train");
        ArrayList<Row> testRows = loadLabeledRows("test");
        new MemSourceBatchOp(trainRows, COL_NAMES)
            .link(new AkSinkBatchOp().setFilePath(DATA_DIR + TRAIN_FILE));
        new MemSourceBatchOp(testRows, COL_NAMES)
            .link(new AkSinkBatchOp().setFilePath(DATA_DIR + TEST_FILE));
        BatchOperator.execute();
    }

    AkSourceBatchOp train_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    AkSourceBatchOp test_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);
    train_set.lazyPrint(2);

    // Run 1: DocCountVectorizer as the vectorizing stage.
    new Pipeline()
        .add(new RegexTokenizer()
            .setPattern("\\W+")
            .setSelectedCol(TXT_COL_NAME))
        .add(new DocCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol(VECTOR_COL_NAME)
            .enableLazyPrintTransformData(1))
        .add(new LogisticRegression()
            .setMaxIter(30)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME))
        .fit(train_set)
        .transform(test_set)
        .link(new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("DocCountVectorizer"));
    BatchOperator.execute();

    // Run 2: identical stages, but with DocHashCountVectorizer as the vectorizing stage.
    new Pipeline()
        .add(new RegexTokenizer()
            .setPattern("\\W+")
            .setSelectedCol(TXT_COL_NAME))
        .add(new DocHashCountVectorizer()
            .setFeatureType("WORD_COUNT")
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol(VECTOR_COL_NAME)
            .enableLazyPrintTransformData(1))
        .add(new LogisticRegression()
            .setMaxIter(30)
            .setVectorCol(VECTOR_COL_NAME)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME))
        .fit(train_set)
        .transform(test_set)
        .link(new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("DocHashCountVectorizer"));
    BatchOperator.execute();
}

/**
 * Reads every file in the {@code pos} and {@code neg} subfolders of one raw data split and
 * returns one {@code Row.of(label, content)} per file, {@code pos} rows first.
 *
 * @param split name of the split folder directly under {@code ORIGIN_DATA_DIR}
 *              ({@code "train"} or {@code "test"})
 * @return the labeled rows of that split
 * @throws IllegalStateException if a label folder is missing, is not a directory, or cannot be
 *                               listed
 * @throws Exception if {@code readFileContent} fails for a file
 */
private static ArrayList<Row> loadLabeledRows(String split) throws Exception {
    ArrayList<Row> rows = new ArrayList<>();
    for (String label : new String[] { "pos", "neg" }) {
        // No separator is inserted before the split name, as in the original path expression;
        // presumably ORIGIN_DATA_DIR already ends with one -- TODO confirm.
        File subfolder = new File(ORIGIN_DATA_DIR + split + File.separator + label);
        // listFiles() returns null rather than throwing when the path is not a readable
        // directory; fail with the offending path instead of a bare NullPointerException.
        File[] files = subfolder.listFiles();
        if (files == null) {
            throw new IllegalStateException(
                "Cannot list raw data folder: " + subfolder.getAbsolutePath());
        }
        for (File f : files) {
            rows.add(Row.of(label, readFileContent(f)));
        }
    }
    return rows;
}

Also used : ArrayList(java.util.ArrayList) DocCountVectorizer(com.alibaba.alink.pipeline.nlp.DocCountVectorizer) Pipeline(com.alibaba.alink.pipeline.Pipeline) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) RegexTokenizer(com.alibaba.alink.pipeline.nlp.RegexTokenizer) Row(org.apache.flink.types.Row) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) File(java.io.File) DocHashCountVectorizer(com.alibaba.alink.pipeline.nlp.DocHashCountVectorizer)

Example 2 with DocHashCountVectorizer

Use of com.alibaba.alink.pipeline.nlp.DocHashCountVectorizer in the project Alink by Alibaba.

From the class Chap21, method c_4.

/**
 * Segments the first ten news titles from {@code getSource()} and vectorizes them with each
 * feature type, first through {@code DocCountVectorizer} and then through
 * {@code DocHashCountVectorizer} (100 features), registering a lazy print for every result
 * before the batch job is executed once at the end.
 *
 * @throws Exception if building or executing the batch job fails
 */
static void c_4() throws Exception {
    BatchOperator.setParallelism(1);

    BatchOperator titles = getSource()
        .firstN(10)
        .select("news_title")
        .link(new SegmentBatchOp()
            .setSelectedCol("news_title")
            .setOutputCol("segmented_title")
            .setReservedCols(new String[] {}));

    // Both vectorizers are exercised over the same feature types, in the same order.
    final String[] featureTypes = { "WORD_COUNT", "BINARY", "TF", "IDF", "TF_IDF" };

    for (String type : featureTypes) {
        new DocCountVectorizer()
            .setFeatureType(type)
            .setSelectedCol("segmented_title")
            .setOutputCol("vec")
            .fit(titles)
            .transform(titles)
            .lazyPrint(-1, "DocCountVectorizer + " + type);
    }

    for (String type : featureTypes) {
        new DocHashCountVectorizer()
            .setFeatureType(type)
            .setSelectedCol("segmented_title")
            .setOutputCol("vec")
            .setNumFeatures(100)
            .fit(titles)
            .transform(titles)
            .lazyPrint(-1, "DocHashCountVectorizer + " + type);
    }

    BatchOperator.execute();
}

Also used : SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) DocCountVectorizer(com.alibaba.alink.pipeline.nlp.DocCountVectorizer) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) DocHashCountVectorizer(com.alibaba.alink.pipeline.nlp.DocHashCountVectorizer)

Aggregations

DocCountVectorizer (com.alibaba.alink.pipeline.nlp.DocCountVectorizer): 2, DocHashCountVectorizer (com.alibaba.alink.pipeline.nlp.DocHashCountVectorizer): 2, BatchOperator (com.alibaba.alink.operator.batch.BatchOperator): 1, EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp): 1, SegmentBatchOp (com.alibaba.alink.operator.batch.nlp.SegmentBatchOp): 1, AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp): 1, AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp): 1, MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp): 1, Pipeline (com.alibaba.alink.pipeline.Pipeline): 1, LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression): 1, RegexTokenizer (com.alibaba.alink.pipeline.nlp.RegexTokenizer): 1, File (java.io.File): 1, ArrayList (java.util.ArrayList): 1, Row (org.apache.flink.types.Row): 1