Search in sources :

Example 51 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class DocCountVectorizerModelMapperTest method testOutputCol.

/**
 * Checks the mapper's result when an explicit output column ("output") is configured.
 *
 * <p>The model's metadata row declares {@code featureType} "TF" and {@code minTF} "1.0"; the
 * remaining rows each carry an {@code f0}/{@code f1}/{@code f2} JSON record. The sentence
 * "a b c d e" is mapped and field 1 of the resulting row is compared against the expected
 * sparse vector. The expected indices match the {@code f2} values of tokens a, b, e, d and c.
 *
 * <p>NOTE(review): field 1 is presumably the appended output column (field 0 being the input
 * sentence) — confirm against {@code dataSchema}, which is declared outside this snippet.
 *
 * @throws Exception if loading the model or mapping the row fails
 */
@Test
public void testOutputCol() throws Exception {
    List<Row> model = Arrays.asList(
        Row.of(0L, "{\"minTF\":\"1.0\",\"featureType\":\"\\\"TF\\\"\"}"),
        Row.of(1048576L, "{\"f0\":\"i\",\"f1\":0.6931471805599453,\"f2\":6}"),
        Row.of(2097152L, "{\"f0\":\"e\",\"f1\":0.1823215567939546,\"f2\":2}"),
        Row.of(3145728L, "{\"f0\":\"a\",\"f1\":0.4054651081081644,\"f2\":0}"),
        Row.of(4194304L, "{\"f0\":\"b\",\"f1\":0.1823215567939546,\"f2\":1}"),
        Row.of(5242880L, "{\"f0\":\"c\",\"f1\":0.6931471805599453,\"f2\":7}"),
        Row.of(6291456L, "{\"f0\":\"h\",\"f1\":0.4054651081081644,\"f2\":3}"),
        Row.of(7340032L, "{\"f0\":\"d\",\"f1\":0.6931471805599453,\"f2\":4}"),
        Row.of(8388608L, "{\"f0\":\"j\",\"f1\":0.6931471805599453,\"f2\":5}"),
        Row.of(9437184L, "{\"f0\":\"g\",\"f1\":0.6931471805599453,\"f2\":8}"),
        Row.of(10485760L, "{\"f0\":\"n\",\"f1\":1.0986122886681098,\"f2\":9}"),
        Row.of(11534336L, "{\"f0\":\"f\",\"f1\":1.0986122886681098,\"f2\":10}"));

    Params params = new Params()
        .set(DocCountVectorizerPredictParams.SELECTED_COL, "sentence")
        .set(DocCountVectorizerPredictParams.OUTPUT_COL, "output");

    DocCountVectorizerModelMapper mapper = new DocCountVectorizerModelMapper(modelSchema, dataSchema, params);
    mapper.loadModel(model);

    // Five tokens, each appearing once: every expected entry is 0.2.
    SparseVector expected = new SparseVector(11, new int[] { 0, 1, 2, 4, 7 }, new double[] { 0.2, 0.2, 0.2, 0.2, 0.2 });
    Object actual = mapper.map(Row.of("a b c d e")).getField(1);

    // JUnit's contract is assertEquals(expected, actual); keeping that order makes
    // the failure message label the two values correctly.
    assertEquals(expected, actual);
}
Also used : DocCountVectorizerPredictParams(com.alibaba.alink.params.nlp.DocCountVectorizerPredictParams) Params(org.apache.flink.ml.api.misc.param.Params) Row(org.apache.flink.types.Row) SparseVector(com.alibaba.alink.common.linalg.SparseVector) Test(org.junit.Test)

Example 52 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class DocCountVectorizerModelMapperTest method testTFIDFType.

/**
 * Checks the mapper's result for a model whose metadata declares {@code featureType} "TF_IDF".
 *
 * <p>No output column is configured, and field 0 of the mapped row is compared against the
 * expected sparse vector. For the sentence "a b c d e" each expected value equals the
 * corresponding token's {@code f1} from the model multiplied by 0.2 (e.g. token "a":
 * 0.4054651081081644 * 0.2 = 0.08109302162163289).
 *
 * @throws Exception if loading the model or mapping the row fails
 */
@Test
public void testTFIDFType() throws Exception {
    List<Row> model = Arrays.asList(
        Row.of(0L, "{\"minTF\":\"1.0\",\"featureType\":\"\\\"TF_IDF\\\"\"}"),
        Row.of(1048576L, "{\"f0\":\"i\",\"f1\":0.6931471805599453,\"f2\":6}"),
        Row.of(2097152L, "{\"f0\":\"e\",\"f1\":0.1823215567939546,\"f2\":2}"),
        Row.of(3145728L, "{\"f0\":\"a\",\"f1\":0.4054651081081644,\"f2\":0}"),
        Row.of(4194304L, "{\"f0\":\"b\",\"f1\":0.1823215567939546,\"f2\":1}"),
        Row.of(5242880L, "{\"f0\":\"c\",\"f1\":0.6931471805599453,\"f2\":7}"),
        Row.of(6291456L, "{\"f0\":\"h\",\"f1\":0.4054651081081644,\"f2\":3}"),
        Row.of(7340032L, "{\"f0\":\"d\",\"f1\":0.6931471805599453,\"f2\":4}"),
        Row.of(8388608L, "{\"f0\":\"j\",\"f1\":0.6931471805599453,\"f2\":5}"),
        Row.of(9437184L, "{\"f0\":\"g\",\"f1\":0.6931471805599453,\"f2\":8}"),
        Row.of(10485760L, "{\"f0\":\"n\",\"f1\":1.0986122886681098,\"f2\":9}"),
        Row.of(11534336L, "{\"f0\":\"f\",\"f1\":1.0986122886681098,\"f2\":10}"));

    Params params = new Params().set(DocCountVectorizerPredictParams.SELECTED_COL, "sentence");

    DocCountVectorizerModelMapper mapper = new DocCountVectorizerModelMapper(modelSchema, dataSchema, params);
    mapper.loadModel(model);

    SparseVector expected = new SparseVector(
        11,
        new int[] { 0, 1, 2, 4, 7 },
        new double[] { 0.08109302162163289, 0.03646431135879092, 0.03646431135879092, 0.13862943611198905, 0.13862943611198905 });
    Object actual = mapper.map(Row.of("a b c d e")).getField(0);

    // JUnit's contract is assertEquals(expected, actual); keeping that order makes
    // the failure message label the two values correctly.
    assertEquals(expected, actual);
}
Also used : DocCountVectorizerPredictParams(com.alibaba.alink.params.nlp.DocCountVectorizerPredictParams) Params(org.apache.flink.ml.api.misc.param.Params) Row(org.apache.flink.types.Row) SparseVector(com.alibaba.alink.common.linalg.SparseVector) Test(org.junit.Test)

Example 53 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class DocHashCountVectorizerModelMapperTest method testTFIDF.

/**
 * Checks the hashing mapper's result and output schema for a "TF_IDF" model.
 *
 * <p>The model's metadata row declares {@code numFeatures} "20", {@code minTF} "1.0" and
 * {@code featureType} "TF_IDF"; the second row is a JSON object keyed by "16", "7", "13",
 * "14" and "15". The sentence "a b c d a a " is mapped and field 0 is compared against the
 * expected vector; the output schema is then expected to be a single "sentence" column of
 * type {@code VectorTypes.SPARSE_VECTOR}.
 *
 * <p>NOTE(review): the keys of the second model row are presumably hashed feature indices
 * and the values their IDF weights — confirm against the model format.
 *
 * @throws Exception if loading the model or mapping the row fails
 */
@Test
public void testTFIDF() throws Exception {
    List<Row> model = Arrays.asList(
        Row.of(0L, "{\"numFeatures\":\"20\",\"minTF\":\"1.0\",\"featureType\":\"\\\"TF_IDF\\\"\"}"),
        Row.of(1048576L, "{\"16\":0.4054651081081644,\"7\":0.0,\"13\":0.4054651081081644,\"14\":-0.5108256237659907," + "\"15\":-0.2876820724517809}"));

    Params params = new Params().set(DocHashCountVectorizerPredictParams.SELECTED_COL, "sentence");

    DocHashCountVectorizerModelMapper mapper = new DocHashCountVectorizerModelMapper(modelSchema, dataSchema, params);
    mapper.loadModel(model);

    SparseVector expectedVector = new SparseVector(
        20,
        new int[] { 7, 13, 14, 15 },
        new double[] { 0.0, 0.06757751801802739, -0.25541281188299536, -0.047947012075296815 });
    Object actualVector = mapper.map(Row.of("a b c d a a ")).getField(0);

    // JUnit's contract is assertEquals(expected, actual); keeping that order makes
    // the failure message label the two values correctly.
    assertEquals(expectedVector, actualVector);

    TableSchema expectedSchema = new TableSchema(new String[] { "sentence" }, new TypeInformation[] { VectorTypes.SPARSE_VECTOR });
    assertEquals(expectedSchema, mapper.getOutputSchema());
}
Also used : TableSchema(org.apache.flink.table.api.TableSchema) DocCountVectorizerTrainParams(com.alibaba.alink.params.nlp.DocCountVectorizerTrainParams) DocHashCountVectorizerPredictParams(com.alibaba.alink.params.nlp.DocHashCountVectorizerPredictParams) Params(org.apache.flink.ml.api.misc.param.Params) Row(org.apache.flink.types.Row) SparseVector(com.alibaba.alink.common.linalg.SparseVector) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) Test(org.junit.Test)

Example 54 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class DocHashCountVectorizerModelMapperTest method testWordCount.

/**
 * Checks the hashing mapper's result for a model whose metadata declares
 * {@code featureType} "WORD_COUNT".
 *
 * <p>The sentence "a b c d a a " is mapped and field 0 is compared against a 20-dimensional
 * sparse vector with values 1.0, 1.0, 3.0 and 1.0 at indices 7, 13, 14 and 15. The value 3.0
 * matches the three occurrences of "a" in the sentence.
 *
 * @throws Exception if loading the model or mapping the row fails
 */
@Test
public void testWordCount() throws Exception {
    List<Row> model = Arrays.asList(
        Row.of(0L, "{\"numFeatures\":\"20\",\"minTF\":\"1.0\",\"featureType\":\"\\\"WORD_COUNT\\\"\"}"),
        Row.of(1048576L, "{\"16\":0.4054651081081644,\"7\":0.0,\"13\":0.4054651081081644,\"14\":-0.5108256237659907," + "\"15\":-0.2876820724517809}"));

    Params params = new Params().set(DocHashCountVectorizerPredictParams.SELECTED_COL, "sentence");

    DocHashCountVectorizerModelMapper mapper = new DocHashCountVectorizerModelMapper(modelSchema, dataSchema, params);
    mapper.loadModel(model);

    SparseVector expected = new SparseVector(20, new int[] { 7, 13, 14, 15 }, new double[] { 1.0, 1.0, 3.0, 1.0 });
    Object actual = mapper.map(Row.of("a b c d a a ")).getField(0);

    // JUnit's contract is assertEquals(expected, actual); keeping that order makes
    // the failure message label the two values correctly.
    assertEquals(expected, actual);
}
Also used : DocCountVectorizerTrainParams(com.alibaba.alink.params.nlp.DocCountVectorizerTrainParams) DocHashCountVectorizerPredictParams(com.alibaba.alink.params.nlp.DocHashCountVectorizerPredictParams) Params(org.apache.flink.ml.api.misc.param.Params) Row(org.apache.flink.types.Row) SparseVector(com.alibaba.alink.common.linalg.SparseVector) Test(org.junit.Test)

Example 55 with SparseVector

use of com.alibaba.alink.common.linalg.SparseVector in project Alink by alibaba.

the class DocHashCountVectorizerModelMapperTest method testIDF.

/**
 * Checks the hashing mapper's result for a model whose metadata declares
 * {@code featureType} "IDF".
 *
 * <p>Besides the selected column, the params also set
 * {@code DocCountVectorizerTrainParams.FEATURE_TYPE} to {@code FeatureType.IDF}. The sentence
 * "a b c d a a " is mapped and field 0 is compared against a vector whose values at indices
 * 7, 13, 14 and 15 are exactly the values stored under those keys in the model row.
 *
 * @throws Exception if loading the model or mapping the row fails
 */
@Test
public void testIDF() throws Exception {
    List<Row> model = Arrays.asList(
        Row.of(0L, "{\"numFeatures\":\"20\",\"minTF\":\"1.0\",\"featureType\":\"\\\"IDF\\\"\"}"),
        Row.of(1048576L, "{\"16\":0.4054651081081644,\"7\":0.0,\"13\":0.4054651081081644,\"14\":-0.5108256237659907," + "\"15\":-0.2876820724517809}"));

    Params params = new Params()
        .set(DocHashCountVectorizerPredictParams.SELECTED_COL, "sentence")
        .set(DocCountVectorizerTrainParams.FEATURE_TYPE, FeatureType.IDF);

    DocHashCountVectorizerModelMapper mapper = new DocHashCountVectorizerModelMapper(modelSchema, dataSchema, params);
    mapper.loadModel(model);

    SparseVector expected = new SparseVector(
        20,
        new int[] { 7, 13, 14, 15 },
        new double[] { 0.0, 0.4054651081081644, -0.5108256237659907, -0.2876820724517809 });
    Object actual = mapper.map(Row.of("a b c d a a ")).getField(0);

    // JUnit's contract is assertEquals(expected, actual); keeping that order makes
    // the failure message label the two values correctly.
    assertEquals(expected, actual);
}
Also used : DocCountVectorizerTrainParams(com.alibaba.alink.params.nlp.DocCountVectorizerTrainParams) DocHashCountVectorizerPredictParams(com.alibaba.alink.params.nlp.DocHashCountVectorizerPredictParams) Params(org.apache.flink.ml.api.misc.param.Params) Row(org.apache.flink.types.Row) SparseVector(com.alibaba.alink.common.linalg.SparseVector) Test(org.junit.Test)

Aggregations

SparseVector (com.alibaba.alink.common.linalg.SparseVector): 125; Test (org.junit.Test): 63; DenseVector (com.alibaba.alink.common.linalg.DenseVector): 60; Params (org.apache.flink.ml.api.misc.param.Params): 45; Row (org.apache.flink.types.Row): 45; Vector (com.alibaba.alink.common.linalg.Vector): 40; TableSchema (org.apache.flink.table.api.TableSchema): 27; ArrayList (java.util.ArrayList): 21; TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 15; HashMap (java.util.HashMap): 12; Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 12; List (java.util.List): 11; DenseMatrix (com.alibaba.alink.common.linalg.DenseMatrix): 10; MTable (com.alibaba.alink.common.MTable): 7; BaseVectorSummary (com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary): 6; CollectSinkStreamOp (com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp): 6; Map (java.util.Map): 6; MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp): 5; VectorAssemblerParams (com.alibaba.alink.params.dataproc.vector.VectorAssemblerParams): 5; OneHotPredictParams (com.alibaba.alink.params.feature.OneHotPredictParams): 5