Search in sources :

Example 1 with Imputer

use of com.alibaba.alink.pipeline.dataproc.Imputer in project Alink by alibaba.

the class Chap07 method c_5.

static void c_5() throws Exception {
    Row[] rows = new Row[] { Row.of("a", 10.0, 100), Row.of("b", -2.5, 9), Row.of("c", 100.2, 1), Row.of("d", -99.9, 100), Row.of(null, null, null) };
    MemSourceBatchOp source = new MemSourceBatchOp(rows, new String[] { "col1", "col2", "col3" });
    source.lazyPrint(-1, "< origin data >");
    Pipeline pipeline = new Pipeline().add(new Imputer().setSelectedCols("col1").setStrategy(Strategy.VALUE).setFillValue("e")).add(new Imputer().setSelectedCols("col2", "col3").setStrategy(Strategy.MEAN));
    pipeline.fit(source).transform(source).print();
    System.out.println(210 / 4);
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) Imputer(com.alibaba.alink.pipeline.dataproc.Imputer) Row(org.apache.flink.types.Row) Pipeline(com.alibaba.alink.pipeline.Pipeline)

Example 2 with Imputer

use of com.alibaba.alink.pipeline.dataproc.Imputer in project Alink by alibaba.

the class Chap11 method c_3.

static void c_3() throws Exception {
    BatchOperator<?> source = getSource();
    BatchOperator t1 = source.filter("ts < CAST('2014-07-16 00:00:00' AS TIMESTAMP)");
    BatchOperator t2 = source.filter("ts >= CAST('2014-07-16 00:00:00' AS TIMESTAMP)");
    t1.lazyPrint(3, "[ ts < '2014-07-16 00:00:00' ]").lazyPrintStatistics();
    t2.lazyPrint(3, "[ ts >= '2014-07-16 00:00:00' ]").lazyPrintStatistics();
    BatchOperator.execute();
    String clausePreProc = "user_id, brand_id, type, ts, past_days," + "case when type=0 then 1 else 0 end AS is_click," + "case when type=1 then 1 else 0 end AS is_buy," + "case when type=2 then 1 else 0 end AS is_collect," + "case when type=3 then 1 else 0 end AS is_cart," + "case when type=0 and past_days<=30 then 1 else 0 end AS is_click_1m," + "case when type=1 and past_days<=30 then 1 else 0 end AS is_buy_1m," + "case when type=2 and past_days<=30 then 1 else 0 end AS is_collect_1m," + "case when type=3 and past_days<=30 then 1 else 0 end AS is_cart_1m," + "case when type=0 and past_days<=60 then 1 else 0 end AS is_click_2m," + "case when type=1 and past_days<=60 then 1 else 0 end AS is_buy_2m," + "case when type=2 and past_days<=60 then 1 else 0 end AS is_collect_2m," + "case when type=3 and past_days<=60 then 1 else 0 end AS is_cart_2m," + "case when type=0 and past_days<=90 then 1 else 0 end AS is_click_3m," + "case when type=1 and past_days<=90 then 1 else 0 end AS is_buy_3m," + "case when type=2 and past_days<=90 then 1 else 0 end AS is_collect_3m," + "case when type=3 and past_days<=90 then 1 else 0 end AS is_cart_3m," + "case when type=0 and past_days>30 and past_days<=60 then 1 else 0 end AS is_click_m2nd," + "case when type=1 and past_days>30 and past_days<=60 then 1 else 0 end AS is_buy_m2nd," + "case when type=2 and past_days>30 and past_days<=60 then 1 else 0 end AS is_collect_m2nd," + "case when type=3 and past_days>30 and past_days<=60 then 1 else 0 end AS is_cart_m2nd," + "case when type=0 and past_days>60 and past_days<=90 then 1 else 0 end AS is_click_m3th," + "case when type=1 and past_days>60 and past_days<=90 then 1 else 0 end AS is_buy_m3th," + "case when type=2 and past_days>60 and past_days<=90 then 1 else 0 end AS is_collect_m3th," + "case when type=3 and past_days>60 and past_days<=90 then 1 else 0 end AS is_cart_m3th," + "case when type=0 and past_days<=3 then 1 else 0 end AS is_click_3d," + "case when type=1 and past_days<=3 then 1 else 0 end AS is_buy_3d," + "case when type=2 and past_days<=3 then 1 else 0 end AS is_collect_3d," + "case when type=3 and past_days<=3 then 1 else 0 end AS is_cart_3d," + "case when type=0 and past_days>3 and past_days<=6 then 1 else 0 end AS is_click_3d2nd," + "case when type=1 and past_days>3 and past_days<=6 then 1 else 0 end AS is_buy_3d2nd," + "case when type=2 and past_days>3 and past_days<=6 then 1 else 0 end AS is_collect_3d2nd," + "case when type=3 and past_days>3 and past_days<=6 then 1 else 0 end AS is_cart_3d2nd," + "case when type=0 and past_days>6 and past_days<=9 then 1 else 0 end AS is_click_3d3th," + "case when type=1 and past_days>6 and past_days<=9 then 1 else 0 end AS is_buy_3d3th," + "case when type=2 and past_days>6 and past_days<=9 then 1 else 0 end AS is_collect_3d3th," + "case when type=3 and past_days>6 and past_days<=9 then 1 else 0 end AS is_cart_3d3th," + "case when type=0 and past_days<=7 then 1 else 0 end AS is_click_1w," + "case when type=1 and past_days<=7 then 1 else 0 end AS is_buy_1w," + "case when type=2 and past_days<=7 then 1 else 0 end AS is_collect_1w," + "case when type=3 and past_days<=7 then 1 else 0 end AS is_cart_1w," + "case when type=0 and past_days>7 and past_days<=14 then 1 else 0 end AS is_click_w2nd," + "case when type=1 and past_days>7 and past_days<=14 then 1 else 0 end AS is_buy_w2nd," + "case when type=2 and past_days>7 and past_days<=14 then 1 else 0 end AS is_collect_w2nd," + "case when type=3 and past_days>7 and past_days<=14 then 1 else 0 end AS is_cart_w2nd," + "case when type=0 and past_days>14 and past_days<=21 then 1 else 0 end AS is_click_w3th," + "case when type=1 and past_days>14 and past_days<=21 then 1 else 0 end AS is_buy_w3th," + "case when type=2 and past_days>14 and past_days<=21 then 1 else 0 end AS is_collect_w3th," + "case when type=3 and past_days>14 and past_days<=21 then 1 else 0 end AS is_cart_w3th";
    BatchOperator t1_preproc = t1.select("user_id, brand_id, type, ts, " + "TIMESTAMPDIFF(DAY, ts, TIMESTAMP '2014-07-16 00:00:00') AS past_days").select(clausePreProc);
    String clauseUserBrand = "user_id, brand_id, SUM(is_click) as cnt_click, SUM(is_buy) as cnt_buy, " + "SUM(is_collect) as cnt_collect, SUM(is_cart) as cnt_cart, " + "SUM(is_click_1m) as cnt_click_1m, SUM(is_buy_1m) as cnt_buy_1m, " + "SUM(is_collect_1m) as cnt_collect_1m, SUM(is_cart_1m) as cnt_cart_1m, " + "SUM(is_click_2m) as cnt_click_2m, SUM(is_buy_2m) as cnt_buy_2m, " + "SUM(is_collect_2m) as cnt_collect_2m, SUM(is_cart_2m) as cnt_cart_2m, " + "SUM(is_click_3m) as cnt_click_3m, SUM(is_buy_3m) as cnt_buy_3m, " + "SUM(is_collect_3m) as cnt_collect_3m, SUM(is_cart_3m) as cnt_cart_3m, " + "SUM(is_click_m2nd) as cnt_click_m2nd, SUM(is_buy_m2nd) as cnt_buy_m2nd, " + "SUM(is_collect_m2nd) as cnt_collect_m2nd, SUM(is_cart_m2nd) as cnt_cart_m2nd, " + "SUM(is_click_m3th) as cnt_click_m3th, SUM(is_buy_m3th) as cnt_buy_m3th, " + "SUM(is_collect_m3th) as cnt_collect_m3th, SUM(is_cart_m3th) as cnt_cart_m3th, " + "SUM(is_click_3d) as cnt_click_3d, SUM(is_buy_3d) as cnt_buy_3d, " + "SUM(is_collect_3d) as cnt_collect_3d, SUM(is_cart_3d) as cnt_cart_3d, " + "SUM(is_click_3d2nd) as cnt_click_3d2nd, SUM(is_buy_3d2nd) as cnt_buy_3d2nd, " + "SUM(is_collect_3d2nd) as cnt_collect_3d2nd, SUM(is_cart_3d2nd) as cnt_cart_3d2nd, " + "SUM(is_click_3d3th) as cnt_click_3d3th, SUM(is_buy_3d3th) as cnt_buy_3d3th, " + "SUM(is_collect_3d3th) as cnt_collect_3d3th, SUM(is_cart_3d3th) as cnt_cart_3d3th, " + "SUM(is_click_1w) as cnt_click_1w, SUM(is_buy_1w) as cnt_buy_1w, " + "SUM(is_collect_1w) as cnt_collect_1w, SUM(is_cart_1w) as cnt_cart_1w, " + "SUM(is_click_w2nd) as cnt_click_w2nd, SUM(is_buy_w2nd) as cnt_buy_w2nd, " + "SUM(is_collect_w2nd) as cnt_collect_w2nd, SUM(is_cart_w2nd) as cnt_cart_w2nd, " + "SUM(is_click_w3th) as cnt_click_w3th, SUM(is_buy_w3th) as cnt_buy_w3th, " + "SUM(is_collect_w3th) as cnt_collect_w3th, SUM(is_cart_w3th) as cnt_cart_w3th";
    BatchOperator t1_userbrand = t1_preproc.groupBy("user_id, brand_id", clauseUserBrand);
    String clauseUserBrand_Rate = "user_id,brand_id," + "cnt_click,cnt_buy,cnt_collect,cnt_cart," + "cnt_click_1m,cnt_buy_1m,cnt_collect_1m,cnt_cart_1m," + "cnt_click_2m,cnt_buy_2m,cnt_collect_2m,cnt_cart_2m," + "cnt_click_3m,cnt_buy_3m,cnt_collect_3m,cnt_cart_3m," + "cnt_click_m2nd,cnt_buy_m2nd,cnt_collect_m2nd,cnt_cart_m2nd," + "cnt_click_m3th,cnt_buy_m3th,cnt_collect_m3th,cnt_cart_m3th," + "cnt_click_3d,cnt_buy_3d,cnt_collect_3d,cnt_cart_3d," + "cnt_click_3d2nd,cnt_buy_3d2nd,cnt_collect_3d2nd,cnt_cart_3d2nd," + "cnt_click_3d3th,cnt_buy_3d3th,cnt_collect_3d3th,cnt_cart_3d3th," + "cnt_click_1w,cnt_buy_1w,cnt_collect_1w,cnt_cart_1w," + "cnt_click_w2nd,cnt_buy_w2nd,cnt_collect_w2nd,cnt_cart_w2nd," + "cnt_click_w3th,cnt_buy_w3th,cnt_collect_w3th,cnt_cart_w3th," + "case when cnt_buy>cnt_click then 1.0 when cnt_buy=0 then 0.0 else cnt_buy*1.0/cnt_click end AS " + "rt_click2buy," + "case when cnt_buy>cnt_collect then 1.0 when cnt_buy=0 then 0.0 else cnt_buy*1.0/cnt_collect end AS " + "rt_collect2buy," + "case when cnt_buy>cnt_cart then 1.0 when cnt_buy=0 then 0.0 else cnt_buy*1.0/cnt_cart end AS " + "rt_cart2buy," + "case when cnt_buy_3d>cnt_click_3d then 1.0 when cnt_buy_3d=0 then 0.0 else cnt_buy_3d*1.0/cnt_click_3d" + " end AS rt_click2buy_3d," + "case when cnt_buy_3d>cnt_collect_3d then 1.0 when cnt_buy_3d=0 then 0.0 else " + "cnt_buy_3d*1.0/cnt_collect_3d end AS rt_collect2buy_3d," + "case when cnt_buy_3d>cnt_cart_3d then 1.0 when cnt_buy_3d=0 then 0.0 else cnt_buy_3d*1.0/cnt_cart_3d " + "end AS rt_cart2buy_3d," + "case when cnt_buy_1w>cnt_click_1w then 1.0 when cnt_buy_1w=0 then 0.0 else cnt_buy_1w*1.0/cnt_click_1w" + " end AS rt_click2buy_1w," + "case when cnt_buy_1w>cnt_collect_1w then 1.0 when cnt_buy_1w=0 then 0.0 else " + "cnt_buy_1w*1.0/cnt_collect_1w end AS rt_collect2buy_1w," + "case when cnt_buy_1w>cnt_cart_1w then 1.0 when cnt_buy_1w=0 then 0.0 else cnt_buy_1w*1.0/cnt_cart_1w " + "end AS rt_cart2buy_1w," + "case when cnt_buy_1m>cnt_click_1m then 1.0 when cnt_buy_1m=0 then 0.0 else cnt_buy_1m*1.0/cnt_click_1m" + " end AS rt_click2buy_1m," + "case when cnt_buy_1m>cnt_collect_1m then 1.0 when cnt_buy_1m=0 then 0.0 else " + "cnt_buy_1m*1.0/cnt_collect_1m end AS rt_collect2buy_1m," + "case when cnt_buy_1m>cnt_cart_1m then 1.0 when cnt_buy_1m=0 then 0.0 else cnt_buy_1m*1.0/cnt_cart_1m " + "end AS rt_cart2buy_1m," + "case when cnt_click_3d=0 then 0.0 when cnt_click_3d>=120.0*cnt_click_3d2nd then 120.0 else " + "cnt_click_3d*1.0/cnt_click_3d2nd end AS rt_click_3d," + "case when cnt_buy_3d=0 then 0.0 when cnt_buy_3d>=10.0*cnt_buy_3d2nd then 10.0 else " + "cnt_buy_3d*1.0/cnt_buy_3d2nd end AS rt_buy_3d," + "case when cnt_collect_3d=0 then 0.0 when cnt_collect_3d>=10.0*cnt_collect_3d2nd then 10.0 else " + "cnt_collect_3d*1.0/cnt_collect_3d2nd end AS rt_collect_3d," + "case when cnt_cart_3d=0 then 0.0 when cnt_cart_3d>=20.0*cnt_cart_3d2nd then 20.0 else " + "cnt_cart_3d*1.0/cnt_cart_3d2nd end AS rt_cart_3d," + "case when cnt_click_1w=0 then 0.0 when cnt_click_1w>=300.0*cnt_click_w2nd then 300.0 else " + "cnt_click_1w*1.0/cnt_click_w2nd end AS rt_click_1w," + "case when cnt_buy_1w=0 then 0.0 when cnt_buy_1w>=15.0*cnt_buy_w2nd then 15.0 else " + "cnt_buy_1w*1.0/cnt_buy_w2nd end AS rt_buy_1w," + "case when cnt_collect_1w=0 then 0.0 when cnt_collect_1w>=15.0*cnt_collect_w2nd then 15.0 else " + "cnt_collect_1w*1.0/cnt_collect_w2nd end AS rt_collect_1w," + "case when cnt_cart_1w=0 then 0.0 when cnt_cart_1w>=40.0*cnt_cart_w2nd then 40.0 else " + "cnt_cart_1w*1.0/cnt_cart_w2nd end AS rt_cart_1w," + "case when cnt_click_1m=0 then 0.0 when cnt_click_1m>=500.0*cnt_click_m2nd then 500.0 else " + "cnt_click_1m*1.0/cnt_click_m2nd end AS rt_click_1m," + "case when cnt_buy_1m=0 then 0.0 when cnt_buy_1m>=30.0*cnt_buy_m2nd then 30.0 else " + "cnt_buy_1m*1.0/cnt_buy_m2nd end AS rt_buy_1m," + "case when cnt_collect_1m=0 then 0.0 when cnt_collect_1m>=30.0*cnt_collect_m2nd then 30.0 else " + "cnt_collect_1m*1.0/cnt_collect_m2nd end AS rt_collect_1m," + "case when cnt_cart_1m=0 then 0.0 when cnt_cart_1m>=50.0*cnt_cart_m2nd then 50.0 else " + "cnt_cart_1m*1.0/cnt_cart_m2nd end AS rt_cart_1m";
    t1_userbrand = t1_userbrand.select(clauseUserBrand_Rate);
    String clauseUser = "user_id, " + "SUM(is_click) as user_cnt_click, SUM(is_buy) as user_cnt_buy, " + "SUM(is_collect) as user_cnt_collect, SUM(is_cart) as user_cnt_cart, " + "SUM(is_click_1m) as user_cnt_click_1m, SUM(is_buy_1m) as user_cnt_buy_1m, " + "SUM(is_collect_1m) as user_cnt_collect_1m, SUM(is_cart_1m) as user_cnt_cart_1m, " + "SUM(is_click_2m) as user_cnt_click_2m, SUM(is_buy_2m) as user_cnt_buy_2m, " + "SUM(is_collect_2m) as user_cnt_collect_2m, SUM(is_cart_2m) as user_cnt_cart_2m, " + "SUM(is_click_3m) as user_cnt_click_3m, SUM(is_buy_3m) as user_cnt_buy_3m, " + "SUM(is_collect_3m) as user_cnt_collect_3m, SUM(is_cart_3m) as user_cnt_cart_3m, " + "SUM(is_click_m2nd) as user_cnt_click_m2nd, SUM(is_buy_m2nd) as user_cnt_buy_m2nd, " + "SUM(is_collect_m2nd) as user_cnt_collect_m2nd, SUM(is_cart_m2nd) as user_cnt_cart_m2nd, " + "SUM(is_click_m3th) as user_cnt_click_m3th, SUM(is_buy_m3th) as user_cnt_buy_m3th, " + "SUM(is_collect_m3th) as user_cnt_collect_m3th, SUM(is_cart_m3th) as user_cnt_cart_m3th, " + "SUM(is_click_3d) as user_cnt_click_3d, SUM(is_buy_3d) as user_cnt_buy_3d, " + "SUM(is_collect_3d) as user_cnt_collect_3d, SUM(is_cart_3d) as user_cnt_cart_3d, " + "SUM(is_click_3d2nd) as user_cnt_click_3d2nd, SUM(is_buy_3d2nd) as user_cnt_buy_3d2nd, " + "SUM(is_collect_3d2nd) as user_cnt_collect_3d2nd, SUM(is_cart_3d2nd) as user_cnt_cart_3d2nd, " + "SUM(is_click_3d3th) as user_cnt_click_3d3th, SUM(is_buy_3d3th) as user_cnt_buy_3d3th, " + "SUM(is_collect_3d3th) as user_cnt_collect_3d3th, SUM(is_cart_3d3th) as user_cnt_cart_3d3th, " + "SUM(is_click_1w) as user_cnt_click_1w, SUM(is_buy_1w) as user_cnt_buy_1w, " + "SUM(is_collect_1w) as user_cnt_collect_1w, SUM(is_cart_1w) as user_cnt_cart_1w, " + "SUM(is_click_w2nd) as user_cnt_click_w2nd, SUM(is_buy_w2nd) as user_cnt_buy_w2nd, " + "SUM(is_collect_w2nd) as user_cnt_collect_w2nd, SUM(is_cart_w2nd) as user_cnt_cart_w2nd, " + "SUM(is_click_w3th) as user_cnt_click_w3th, SUM(is_buy_w3th) as user_cnt_buy_w3th, " + "SUM(is_collect_w3th) as user_cnt_collect_w3th, SUM(is_cart_w3th) as user_cnt_cart_w3th";
    BatchOperator t1_user = t1_preproc.groupBy("user_id", clauseUser);
    String clauseUser_Rate = "user_id AS user_id4join," + "user_cnt_click,user_cnt_buy,user_cnt_collect,user_cnt_cart," + "user_cnt_click_1m,user_cnt_buy_1m,user_cnt_collect_1m,user_cnt_cart_1m," + "user_cnt_click_2m,user_cnt_buy_2m,user_cnt_collect_2m,user_cnt_cart_2m," + "user_cnt_click_3m,user_cnt_buy_3m,user_cnt_collect_3m,user_cnt_cart_3m," + "user_cnt_click_m2nd,user_cnt_buy_m2nd,user_cnt_collect_m2nd,user_cnt_cart_m2nd," + "user_cnt_click_m3th,user_cnt_buy_m3th,user_cnt_collect_m3th,user_cnt_cart_m3th," + "user_cnt_click_3d,user_cnt_buy_3d,user_cnt_collect_3d,user_cnt_cart_3d," + "user_cnt_click_3d2nd,user_cnt_buy_3d2nd,user_cnt_collect_3d2nd,user_cnt_cart_3d2nd," + "user_cnt_click_3d3th,user_cnt_buy_3d3th,user_cnt_collect_3d3th,user_cnt_cart_3d3th," + "user_cnt_click_1w,user_cnt_buy_1w,user_cnt_collect_1w,user_cnt_cart_1w," + "user_cnt_click_w2nd,user_cnt_buy_w2nd,user_cnt_collect_w2nd,user_cnt_cart_w2nd," + "user_cnt_click_w3th,user_cnt_buy_w3th,user_cnt_collect_w3th,user_cnt_cart_w3th," + "case when user_cnt_buy>user_cnt_click then 1.0 when user_cnt_buy=0 then 0.0 else " + "user_cnt_buy*1.0/user_cnt_click end AS user_rt_click2buy," + "case when user_cnt_buy>user_cnt_collect then 1.0 when user_cnt_buy=0 then 0.0 else " + "user_cnt_buy*1.0/user_cnt_collect end AS user_rt_collect2buy," + "case when user_cnt_buy>user_cnt_cart then 1.0 when user_cnt_buy=0 then 0.0 else " + "user_cnt_buy*1.0/user_cnt_cart end AS user_rt_cart2buy," + "case when user_cnt_buy_3d>user_cnt_click_3d then 1.0 when user_cnt_buy_3d=0 then 0.0 else " + "user_cnt_buy_3d*1.0/user_cnt_click_3d end AS user_rt_click2buy_3d," + "case when user_cnt_buy_3d>user_cnt_collect_3d then 1.0 when user_cnt_buy_3d=0 then 0.0 else " + "user_cnt_buy_3d*1.0/user_cnt_collect_3d end AS user_rt_collect2buy_3d," + "case when user_cnt_buy_3d>user_cnt_cart_3d then 1.0 when user_cnt_buy_3d=0 then 0.0 else " + "user_cnt_buy_3d*1.0/user_cnt_cart_3d end AS user_rt_cart2buy_3d," + "case when user_cnt_buy_1w>user_cnt_click_1w then 1.0 when user_cnt_buy_1w=0 then 0.0 else " + "user_cnt_buy_1w*1.0/user_cnt_click_1w end AS user_rt_click2buy_1w," + "case when user_cnt_buy_1w>user_cnt_collect_1w then 1.0 when user_cnt_buy_1w=0 then 0.0 else " + "user_cnt_buy_1w*1.0/user_cnt_collect_1w end AS user_rt_collect2buy_1w," + "case when user_cnt_buy_1w>user_cnt_cart_1w then 1.0 when user_cnt_buy_1w=0 then 0.0 else " + "user_cnt_buy_1w*1.0/user_cnt_cart_1w end AS user_rt_cart2buy_1w," + "case when user_cnt_buy_1m>user_cnt_click_1m then 1.0 when user_cnt_buy_1m=0 then 0.0 else " + "user_cnt_buy_1m*1.0/user_cnt_click_1m end AS user_rt_click2buy_1m," + "case when user_cnt_buy_1m>user_cnt_collect_1m then 1.0 when user_cnt_buy_1m=0 then 0.0 else " + "user_cnt_buy_1m*1.0/user_cnt_collect_1m end AS user_rt_collect2buy_1m," + "case when user_cnt_buy_1m>user_cnt_cart_1m then 1.0 when user_cnt_buy_1m=0 then 0.0 else " + "user_cnt_buy_1m*1.0/user_cnt_cart_1m end AS user_rt_cart2buy_1m";
    t1_user = t1_user.select(clauseUser_Rate);
    String clauseBrand = "brand_id, " + "SUM(is_click) as brand_cnt_click, SUM(is_buy) as brand_cnt_buy, " + "SUM(is_collect) as brand_cnt_collect, SUM(is_cart) as brand_cnt_cart, " + "SUM(is_click_1m) as brand_cnt_click_1m, SUM(is_buy_1m) as brand_cnt_buy_1m, " + "SUM(is_collect_1m) as brand_cnt_collect_1m, SUM(is_cart_1m) as brand_cnt_cart_1m, " + "SUM(is_click_2m) as brand_cnt_click_2m, SUM(is_buy_2m) as brand_cnt_buy_2m, " + "SUM(is_collect_2m) as brand_cnt_collect_2m, SUM(is_cart_2m) as brand_cnt_cart_2m, " + "SUM(is_click_3m) as brand_cnt_click_3m, SUM(is_buy_3m) as brand_cnt_buy_3m, " + "SUM(is_collect_3m) as brand_cnt_collect_3m, SUM(is_cart_3m) as brand_cnt_cart_3m, " + "SUM(is_click_m2nd) as brand_cnt_click_m2nd, SUM(is_buy_m2nd) as brand_cnt_buy_m2nd, " + "SUM(is_collect_m2nd) as brand_cnt_collect_m2nd, SUM(is_cart_m2nd) as brand_cnt_cart_m2nd, " + "SUM(is_click_m3th) as brand_cnt_click_m3th, SUM(is_buy_m3th) as brand_cnt_buy_m3th, " + "SUM(is_collect_m3th) as brand_cnt_collect_m3th, SUM(is_cart_m3th) as brand_cnt_cart_m3th, " + "SUM(is_click_3d) as brand_cnt_click_3d, SUM(is_buy_3d) as brand_cnt_buy_3d, " + "SUM(is_collect_3d) as brand_cnt_collect_3d, SUM(is_cart_3d) as brand_cnt_cart_3d, " + "SUM(is_click_3d2nd) as brand_cnt_click_3d2nd, SUM(is_buy_3d2nd) as brand_cnt_buy_3d2nd, " + "SUM(is_collect_3d2nd) as brand_cnt_collect_3d2nd, SUM(is_cart_3d2nd) as brand_cnt_cart_3d2nd, " + "SUM(is_click_3d3th) as brand_cnt_click_3d3th, SUM(is_buy_3d3th) as brand_cnt_buy_3d3th, " + "SUM(is_collect_3d3th) as brand_cnt_collect_3d3th, SUM(is_cart_3d3th) as brand_cnt_cart_3d3th, " + "SUM(is_click_1w) as brand_cnt_click_1w, SUM(is_buy_1w) as brand_cnt_buy_1w, " + "SUM(is_collect_1w) as brand_cnt_collect_1w, SUM(is_cart_1w) as brand_cnt_cart_1w, " + "SUM(is_click_w2nd) as brand_cnt_click_w2nd, SUM(is_buy_w2nd) as brand_cnt_buy_w2nd, " + "SUM(is_collect_w2nd) as brand_cnt_collect_w2nd, SUM(is_cart_w2nd) as brand_cnt_cart_w2nd, " + "SUM(is_click_w3th) as brand_cnt_click_w3th, SUM(is_buy_w3th) as brand_cnt_buy_w3th, " + "SUM(is_collect_w3th) as brand_cnt_collect_w3th, SUM(is_cart_w3th) as brand_cnt_cart_w3th";
    BatchOperator t1_brand = t1_preproc.groupBy("brand_id", clauseBrand);
    String clauseBrand_Rate = "brand_id AS brand_id4join," + "brand_cnt_click,brand_cnt_buy,brand_cnt_collect,brand_cnt_cart," + "brand_cnt_click_1m,brand_cnt_buy_1m,brand_cnt_collect_1m,brand_cnt_cart_1m," + "brand_cnt_click_2m,brand_cnt_buy_2m,brand_cnt_collect_2m,brand_cnt_cart_2m," + "brand_cnt_click_3m,brand_cnt_buy_3m,brand_cnt_collect_3m,brand_cnt_cart_3m," + "brand_cnt_click_m2nd,brand_cnt_buy_m2nd,brand_cnt_collect_m2nd,brand_cnt_cart_m2nd," + "brand_cnt_click_m3th,brand_cnt_buy_m3th,brand_cnt_collect_m3th,brand_cnt_cart_m3th," + "brand_cnt_click_3d,brand_cnt_buy_3d,brand_cnt_collect_3d,brand_cnt_cart_3d," + "brand_cnt_click_3d2nd,brand_cnt_buy_3d2nd,brand_cnt_collect_3d2nd,brand_cnt_cart_3d2nd," + "brand_cnt_click_3d3th,brand_cnt_buy_3d3th,brand_cnt_collect_3d3th,brand_cnt_cart_3d3th," + "brand_cnt_click_1w,brand_cnt_buy_1w,brand_cnt_collect_1w,brand_cnt_cart_1w," + "brand_cnt_click_w2nd,brand_cnt_buy_w2nd,brand_cnt_collect_w2nd,brand_cnt_cart_w2nd," + "brand_cnt_click_w3th,brand_cnt_buy_w3th,brand_cnt_collect_w3th,brand_cnt_cart_w3th," + "case when brand_cnt_buy>brand_cnt_click then 1.0 when brand_cnt_buy=0 then 0.0 else " + "brand_cnt_buy*1.0/brand_cnt_click end AS brand_rt_click2buy," + "case when brand_cnt_buy>brand_cnt_collect then 1.0 when brand_cnt_buy=0 then 0.0 else " + "brand_cnt_buy*1.0/brand_cnt_collect end AS brand_rt_collect2buy," + "case when brand_cnt_buy>brand_cnt_cart then 1.0 when brand_cnt_buy=0 then 0.0 else " + "brand_cnt_buy*1.0/brand_cnt_cart end AS brand_rt_cart2buy," + "case when brand_cnt_buy_3d>brand_cnt_click_3d then 1.0 when brand_cnt_buy_3d=0 then 0.0 else " + "brand_cnt_buy_3d*1.0/brand_cnt_click_3d end AS brand_rt_click2buy_3d," + "case when brand_cnt_buy_3d>brand_cnt_collect_3d then 1.0 when brand_cnt_buy_3d=0 then 0.0 else " + "brand_cnt_buy_3d*1.0/brand_cnt_collect_3d end AS brand_rt_collect2buy_3d," + "case when brand_cnt_buy_3d>brand_cnt_cart_3d then 1.0 when brand_cnt_buy_3d=0 then 0.0 else " + "brand_cnt_buy_3d*1.0/brand_cnt_cart_3d end AS brand_rt_cart2buy_3d," + "case when brand_cnt_buy_1w>brand_cnt_click_1w then 1.0 when brand_cnt_buy_1w=0 then 0.0 else " + "brand_cnt_buy_1w*1.0/brand_cnt_click_1w end AS brand_rt_click2buy_1w," + "case when brand_cnt_buy_1w>brand_cnt_collect_1w then 1.0 when brand_cnt_buy_1w=0 then 0.0 else " + "brand_cnt_buy_1w*1.0/brand_cnt_collect_1w end AS brand_rt_collect2buy_1w," + "case when brand_cnt_buy_1w>brand_cnt_cart_1w then 1.0 when brand_cnt_buy_1w=0 then 0.0 else " + "brand_cnt_buy_1w*1.0/brand_cnt_cart_1w end AS brand_cart2buy_1w," + "case when brand_cnt_buy_1m>brand_cnt_click_1m then 1.0 when brand_cnt_buy_1m=0 then 0.0 else " + "brand_cnt_buy_1m*1.0/brand_cnt_click_1m end AS brand_rt_click2buy_1m," + "case when brand_cnt_buy_1m>brand_cnt_collect_1m then 1.0 when brand_cnt_buy_1m=0 then 0.0 else " + "brand_cnt_buy_1m*1.0/brand_cnt_collect_1m end AS brand_rt_collect2buy_1m," + "case when brand_cnt_buy_1m>brand_cnt_cart_1m then 1.0 when brand_cnt_buy_1m=0 then 0.0 else " + "brand_cnt_buy_1m*1.0/brand_cnt_cart_1m end AS brand_rt_cart2buy_1m";
    t1_brand = t1_brand.select(clauseBrand_Rate);
    BatchOperator t1_join = new JoinBatchOp().setSelectClause("*").setJoinPredicate("user_id=user_id4join").linkFrom(t1_userbrand, t1_user);
    t1_join = new JoinBatchOp().setSelectClause("*").setJoinPredicate("brand_id=brand_id4join").linkFrom(t1_join, t1_brand);
    BatchOperator t2_label = t2.filter("type=1").select("user_id AS user_id4label, brand_id AS brand_id4label, 1 as label").distinct();
    BatchOperator feature_label = new LeftOuterJoinBatchOp().setSelectClause("*").setJoinPredicate("user_id = user_id4label AND brand_id = brand_id4label").linkFrom(t1_join, t2_label);
    Imputer imputer = new Imputer().setStrategy("value").setFillValue("0").setSelectedCols("label");
    feature_label = imputer.fit(feature_label).transform(feature_label);
    System.out.println(feature_label.getSchema());
    String[] featureColNames = ArrayUtils.removeElements(feature_label.getColNames(), new String[] { "user_id", "brand_id", "user_id4join", "brand_id4join", "user_id4label", "brand_id4label", LABEL_COL_NAME });
    StringBuilder sbd = new StringBuilder();
    for (String name : featureColNames) {
        sbd.append("CAST(").append(name).append(" AS DOUBLE) AS ").append(name).append(", ");
    }
    sbd.append(LABEL_COL_NAME);
    feature_label.select(sbd.toString()).link(new AkSinkBatchOp().setFilePath(DATA_DIR + FEATURE_LABEL_FILE).setOverwriteSink(true));
    BatchOperator.execute();
}
Also used : LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) Imputer(com.alibaba.alink.pipeline.dataproc.Imputer) LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) JoinBatchOp(com.alibaba.alink.operator.batch.sql.JoinBatchOp) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator)

Example 3 with Imputer

use of com.alibaba.alink.pipeline.dataproc.Imputer in project Alink by alibaba.

the class ImputerOpTest method testModelInfo.

@Test
public void testModelInfo() throws Exception {
    BatchOperator batchData = (BatchOperator) com.alibaba.alink.pipeline.dataproc.ImputerTest.getData(true);
    ImputerModel fillVal = new Imputer().setSelectedCols("f_string").setOutputCols("f_string_2").setStrategy("value").setFillValue("w").enableLazyPrintModelInfo().fit(batchData);
    fillVal.transform(batchData).lazyCollect();
    ImputerModel mean = new Imputer().enableLazyPrintModelInfo().setSelectedCols("f_double", "f_int").setOutputCols("f_double_2", "f_int_2").setStrategy("mean").fit(batchData);
    mean.transform(batchData).lazyCollect();
    ImputerModel min = new Imputer().setSelectedCols("f_double").enableLazyPrintModelInfo().setStrategy("min").fit(batchData);
    min.transform(batchData).lazyCollect();
    ImputerTrainBatchOp op = new ImputerTrainBatchOp().setSelectedCols("f_double").setStrategy("max").lazyPrintModelInfo().linkFrom(batchData);
    new ImputerPredictBatchOp().linkFrom(op, batchData).collect();
}
Also used : Imputer(com.alibaba.alink.pipeline.dataproc.Imputer) ImputerModel(com.alibaba.alink.pipeline.dataproc.ImputerModel) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) Test(org.junit.Test)

Example 4 with Imputer

use of com.alibaba.alink.pipeline.dataproc.Imputer in project Alink by alibaba.

the class ImputerOpTest method test.

@Test
public void test() throws Exception {
    BatchOperator batchData = (BatchOperator) com.alibaba.alink.pipeline.dataproc.ImputerTest.getData(true);
    StreamOperator streamData = (StreamOperator) com.alibaba.alink.pipeline.dataproc.ImputerTest.getData(false);
    ImputerModel fillVal = new Imputer().setSelectedCols("f_string").setOutputCols("f_string_2").setStrategy("value").setFillValue("w").fit(batchData);
    fillVal.transform(batchData).lazyCollect();
    fillVal.transform(streamData).print();
    ImputerModel mean = new Imputer().setSelectedCols("f_double", "f_int").setOutputCols("f_double_2", "f_int_2").setStrategy("mean").fit(batchData);
    mean.transform(batchData).lazyCollect();
    mean.transform(streamData).print();
    ImputerModel min = new Imputer().setSelectedCols("f_double").setStrategy("min").fit(batchData);
    min.transform(batchData).lazyCollect();
    min.transform(streamData).print();
    ImputerTrainBatchOp op = new ImputerTrainBatchOp().setSelectedCols("f_double").setStrategy("max").linkFrom(batchData);
    new ImputerPredictBatchOp().linkFrom(op, batchData).collect();
    new ImputerPredictStreamOp(op).linkFrom(streamData).print();
    StreamOperator.execute();
}
Also used : Imputer(com.alibaba.alink.pipeline.dataproc.Imputer) ImputerModel(com.alibaba.alink.pipeline.dataproc.ImputerModel) ImputerPredictStreamOp(com.alibaba.alink.operator.stream.dataproc.ImputerPredictStreamOp) StreamOperator(com.alibaba.alink.operator.stream.StreamOperator) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator) Test(org.junit.Test)

Aggregations

Imputer (com.alibaba.alink.pipeline.dataproc.Imputer)4 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)3 ImputerModel (com.alibaba.alink.pipeline.dataproc.ImputerModel)2 Test (org.junit.Test)2 AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp)1 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)1 JoinBatchOp (com.alibaba.alink.operator.batch.sql.JoinBatchOp)1 LeftOuterJoinBatchOp (com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp)1 StreamOperator (com.alibaba.alink.operator.stream.StreamOperator)1 ImputerPredictStreamOp (com.alibaba.alink.operator.stream.dataproc.ImputerPredictStreamOp)1 Pipeline (com.alibaba.alink.pipeline.Pipeline)1 Row (org.apache.flink.types.Row)1