Search in sources :

Example 1 with LeftOuterJoinBatchOp

use of com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp in project Alink by alibaba.

the class Chap05 method c_2_2.

static void c_2_2() throws Exception {
    BatchOperator<?> ratings = Chap24.getSourceRatings();
    BatchOperator<?> items = Chap24.getSourceItems();
    BatchOperator left_ratings = ratings.filter("user_id<3 AND item_id<4").select("user_id, item_id, rating");
    BatchOperator right_movies = items.select("item_id AS movie_id, title").filter("movie_id < 6 AND MOD(movie_id, 2) = 1");
    System.out.println("# left_ratings #");
    left_ratings.print();
    System.out.println("\n# right_movies #");
    right_movies.print();
    System.out.println("# JOIN #");
    new JoinBatchOp().setJoinPredicate("item_id = movie_id").setSelectClause("user_id, item_id, title, rating").linkFrom(left_ratings, right_movies).print();
    System.out.println("\n# LEFT OUTER JOIN #");
    new LeftOuterJoinBatchOp().setJoinPredicate("item_id = movie_id").setSelectClause("user_id, item_id, title, rating").linkFrom(left_ratings, right_movies).print();
    System.out.println("\n# RIGHT OUTER JOIN #");
    new RightOuterJoinBatchOp().setJoinPredicate("item_id = movie_id").setSelectClause("user_id, item_id, title, rating").linkFrom(left_ratings, right_movies).print();
    System.out.println("\n# FULL OUTER JOIN #");
    new FullOuterJoinBatchOp().setJoinPredicate("item_id = movie_id").setSelectClause("user_id, item_id, title, rating").linkFrom(left_ratings, right_movies).print();
}
Also used : LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) FullOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.FullOuterJoinBatchOp) LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) JoinBatchOp(com.alibaba.alink.operator.batch.sql.JoinBatchOp) FullOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.FullOuterJoinBatchOp) RightOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.RightOuterJoinBatchOp) RightOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.RightOuterJoinBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator)

Example 2 with LeftOuterJoinBatchOp

use of com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp in project Alink by alibaba.

the class Chap11 method c_3.

static void c_3() throws Exception {
    BatchOperator<?> source = getSource();
    BatchOperator t1 = source.filter("ts < CAST('2014-07-16 00:00:00' AS TIMESTAMP)");
    BatchOperator t2 = source.filter("ts >= CAST('2014-07-16 00:00:00' AS TIMESTAMP)");
    t1.lazyPrint(3, "[ ts < '2014-07-16 00:00:00' ]").lazyPrintStatistics();
    t2.lazyPrint(3, "[ ts >= '2014-07-16 00:00:00' ]").lazyPrintStatistics();
    BatchOperator.execute();
    String clausePreProc = "user_id, brand_id, type, ts, past_days," + "case when type=0 then 1 else 0 end AS is_click," + "case when type=1 then 1 else 0 end AS is_buy," + "case when type=2 then 1 else 0 end AS is_collect," + "case when type=3 then 1 else 0 end AS is_cart," + "case when type=0 and past_days<=30 then 1 else 0 end AS is_click_1m," + "case when type=1 and past_days<=30 then 1 else 0 end AS is_buy_1m," + "case when type=2 and past_days<=30 then 1 else 0 end AS is_collect_1m," + "case when type=3 and past_days<=30 then 1 else 0 end AS is_cart_1m," + "case when type=0 and past_days<=60 then 1 else 0 end AS is_click_2m," + "case when type=1 and past_days<=60 then 1 else 0 end AS is_buy_2m," + "case when type=2 and past_days<=60 then 1 else 0 end AS is_collect_2m," + "case when type=3 and past_days<=60 then 1 else 0 end AS is_cart_2m," + "case when type=0 and past_days<=90 then 1 else 0 end AS is_click_3m," + "case when type=1 and past_days<=90 then 1 else 0 end AS is_buy_3m," + "case when type=2 and past_days<=90 then 1 else 0 end AS is_collect_3m," + "case when type=3 and past_days<=90 then 1 else 0 end AS is_cart_3m," + "case when type=0 and past_days>30 and past_days<=60 then 1 else 0 end AS is_click_m2nd," + "case when type=1 and past_days>30 and past_days<=60 then 1 else 0 end AS is_buy_m2nd," + "case when type=2 and past_days>30 and past_days<=60 then 1 else 0 end AS is_collect_m2nd," + "case when type=3 and past_days>30 and past_days<=60 then 1 else 0 end AS is_cart_m2nd," + "case when type=0 and past_days>60 and past_days<=90 then 1 else 0 end AS is_click_m3th," + "case when type=1 and past_days>60 and past_days<=90 then 1 else 0 end AS is_buy_m3th," + "case when type=2 and past_days>60 and past_days<=90 then 1 else 0 end AS is_collect_m3th," + "case when type=3 and past_days>60 and past_days<=90 then 1 else 0 end AS is_cart_m3th," + "case when type=0 and past_days<=3 then 1 else 0 end AS is_click_3d," + "case when type=1 and past_days<=3 then 1 else 0 end AS is_buy_3d," + "case when type=2 and past_days<=3 then 1 else 0 end AS is_collect_3d," + "case when type=3 and past_days<=3 then 1 else 0 end AS is_cart_3d," + "case when type=0 and past_days>3 and past_days<=6 then 1 else 0 end AS is_click_3d2nd," + "case when type=1 and past_days>3 and past_days<=6 then 1 else 0 end AS is_buy_3d2nd," + "case when type=2 and past_days>3 and past_days<=6 then 1 else 0 end AS is_collect_3d2nd," + "case when type=3 and past_days>3 and past_days<=6 then 1 else 0 end AS is_cart_3d2nd," + "case when type=0 and past_days>6 and past_days<=9 then 1 else 0 end AS is_click_3d3th," + "case when type=1 and past_days>6 and past_days<=9 then 1 else 0 end AS is_buy_3d3th," + "case when type=2 and past_days>6 and past_days<=9 then 1 else 0 end AS is_collect_3d3th," + "case when type=3 and past_days>6 and past_days<=9 then 1 else 0 end AS is_cart_3d3th," + "case when type=0 and past_days<=7 then 1 else 0 end AS is_click_1w," + "case when type=1 and past_days<=7 then 1 else 0 end AS is_buy_1w," + "case when type=2 and past_days<=7 then 1 else 0 end AS is_collect_1w," + "case when type=3 and past_days<=7 then 1 else 0 end AS is_cart_1w," + "case when type=0 and past_days>7 and past_days<=14 then 1 else 0 end AS is_click_w2nd," + "case when type=1 and past_days>7 and past_days<=14 then 1 else 0 end AS is_buy_w2nd," + "case when type=2 and past_days>7 and past_days<=14 then 1 else 0 end AS is_collect_w2nd," + "case when type=3 and past_days>7 and past_days<=14 then 1 else 0 end AS is_cart_w2nd," + "case when type=0 and past_days>14 and past_days<=21 then 1 else 0 end AS is_click_w3th," + "case when type=1 and past_days>14 and past_days<=21 then 1 else 0 end AS is_buy_w3th," + "case when type=2 and past_days>14 and past_days<=21 then 1 else 0 end AS is_collect_w3th," + "case when type=3 and past_days>14 and past_days<=21 then 1 else 0 end AS is_cart_w3th";
    BatchOperator t1_preproc = t1.select("user_id, brand_id, type, ts, " + "TIMESTAMPDIFF(DAY, ts, TIMESTAMP '2014-07-16 00:00:00') AS past_days").select(clausePreProc);
    String clauseUserBrand = "user_id, brand_id, SUM(is_click) as cnt_click, SUM(is_buy) as cnt_buy, " + "SUM(is_collect) as cnt_collect, SUM(is_cart) as cnt_cart, " + "SUM(is_click_1m) as cnt_click_1m, SUM(is_buy_1m) as cnt_buy_1m, " + "SUM(is_collect_1m) as cnt_collect_1m, SUM(is_cart_1m) as cnt_cart_1m, " + "SUM(is_click_2m) as cnt_click_2m, SUM(is_buy_2m) as cnt_buy_2m, " + "SUM(is_collect_2m) as cnt_collect_2m, SUM(is_cart_2m) as cnt_cart_2m, " + "SUM(is_click_3m) as cnt_click_3m, SUM(is_buy_3m) as cnt_buy_3m, " + "SUM(is_collect_3m) as cnt_collect_3m, SUM(is_cart_3m) as cnt_cart_3m, " + "SUM(is_click_m2nd) as cnt_click_m2nd, SUM(is_buy_m2nd) as cnt_buy_m2nd, " + "SUM(is_collect_m2nd) as cnt_collect_m2nd, SUM(is_cart_m2nd) as cnt_cart_m2nd, " + "SUM(is_click_m3th) as cnt_click_m3th, SUM(is_buy_m3th) as cnt_buy_m3th, " + "SUM(is_collect_m3th) as cnt_collect_m3th, SUM(is_cart_m3th) as cnt_cart_m3th, " + "SUM(is_click_3d) as cnt_click_3d, SUM(is_buy_3d) as cnt_buy_3d, " + "SUM(is_collect_3d) as cnt_collect_3d, SUM(is_cart_3d) as cnt_cart_3d, " + "SUM(is_click_3d2nd) as cnt_click_3d2nd, SUM(is_buy_3d2nd) as cnt_buy_3d2nd, " + "SUM(is_collect_3d2nd) as cnt_collect_3d2nd, SUM(is_cart_3d2nd) as cnt_cart_3d2nd, " + "SUM(is_click_3d3th) as cnt_click_3d3th, SUM(is_buy_3d3th) as cnt_buy_3d3th, " + "SUM(is_collect_3d3th) as cnt_collect_3d3th, SUM(is_cart_3d3th) as cnt_cart_3d3th, " + "SUM(is_click_1w) as cnt_click_1w, SUM(is_buy_1w) as cnt_buy_1w, " + "SUM(is_collect_1w) as cnt_collect_1w, SUM(is_cart_1w) as cnt_cart_1w, " + "SUM(is_click_w2nd) as cnt_click_w2nd, SUM(is_buy_w2nd) as cnt_buy_w2nd, " + "SUM(is_collect_w2nd) as cnt_collect_w2nd, SUM(is_cart_w2nd) as cnt_cart_w2nd, " + "SUM(is_click_w3th) as cnt_click_w3th, SUM(is_buy_w3th) as cnt_buy_w3th, " + "SUM(is_collect_w3th) as cnt_collect_w3th, SUM(is_cart_w3th) as cnt_cart_w3th";
    BatchOperator t1_userbrand = t1_preproc.groupBy("user_id, brand_id", clauseUserBrand);
    String clauseUserBrand_Rate = "user_id,brand_id," + "cnt_click,cnt_buy,cnt_collect,cnt_cart," + "cnt_click_1m,cnt_buy_1m,cnt_collect_1m,cnt_cart_1m," + "cnt_click_2m,cnt_buy_2m,cnt_collect_2m,cnt_cart_2m," + "cnt_click_3m,cnt_buy_3m,cnt_collect_3m,cnt_cart_3m," + "cnt_click_m2nd,cnt_buy_m2nd,cnt_collect_m2nd,cnt_cart_m2nd," + "cnt_click_m3th,cnt_buy_m3th,cnt_collect_m3th,cnt_cart_m3th," + "cnt_click_3d,cnt_buy_3d,cnt_collect_3d,cnt_cart_3d," + "cnt_click_3d2nd,cnt_buy_3d2nd,cnt_collect_3d2nd,cnt_cart_3d2nd," + "cnt_click_3d3th,cnt_buy_3d3th,cnt_collect_3d3th,cnt_cart_3d3th," + "cnt_click_1w,cnt_buy_1w,cnt_collect_1w,cnt_cart_1w," + "cnt_click_w2nd,cnt_buy_w2nd,cnt_collect_w2nd,cnt_cart_w2nd," + "cnt_click_w3th,cnt_buy_w3th,cnt_collect_w3th,cnt_cart_w3th," + "case when cnt_buy>cnt_click then 1.0 when cnt_buy=0 then 0.0 else cnt_buy*1.0/cnt_click end AS " + "rt_click2buy," + "case when cnt_buy>cnt_collect then 1.0 when cnt_buy=0 then 0.0 else cnt_buy*1.0/cnt_collect end AS " + "rt_collect2buy," + "case when cnt_buy>cnt_cart then 1.0 when cnt_buy=0 then 0.0 else cnt_buy*1.0/cnt_cart end AS " + "rt_cart2buy," + "case when cnt_buy_3d>cnt_click_3d then 1.0 when cnt_buy_3d=0 then 0.0 else cnt_buy_3d*1.0/cnt_click_3d" + " end AS rt_click2buy_3d," + "case when cnt_buy_3d>cnt_collect_3d then 1.0 when cnt_buy_3d=0 then 0.0 else " + "cnt_buy_3d*1.0/cnt_collect_3d end AS rt_collect2buy_3d," + "case when cnt_buy_3d>cnt_cart_3d then 1.0 when cnt_buy_3d=0 then 0.0 else cnt_buy_3d*1.0/cnt_cart_3d " + "end AS rt_cart2buy_3d," + "case when cnt_buy_1w>cnt_click_1w then 1.0 when cnt_buy_1w=0 then 0.0 else cnt_buy_1w*1.0/cnt_click_1w" + " end AS rt_click2buy_1w," + "case when cnt_buy_1w>cnt_collect_1w then 1.0 when cnt_buy_1w=0 then 0.0 else " + "cnt_buy_1w*1.0/cnt_collect_1w end AS rt_collect2buy_1w," + "case when cnt_buy_1w>cnt_cart_1w then 1.0 when cnt_buy_1w=0 then 0.0 else cnt_buy_1w*1.0/cnt_cart_1w " + "end AS rt_cart2buy_1w," + "case when cnt_buy_1m>cnt_click_1m then 1.0 when cnt_buy_1m=0 then 0.0 else cnt_buy_1m*1.0/cnt_click_1m" + " end AS rt_click2buy_1m," + "case when cnt_buy_1m>cnt_collect_1m then 1.0 when cnt_buy_1m=0 then 0.0 else " + "cnt_buy_1m*1.0/cnt_collect_1m end AS rt_collect2buy_1m," + "case when cnt_buy_1m>cnt_cart_1m then 1.0 when cnt_buy_1m=0 then 0.0 else cnt_buy_1m*1.0/cnt_cart_1m " + "end AS rt_cart2buy_1m," + "case when cnt_click_3d=0 then 0.0 when cnt_click_3d>=120.0*cnt_click_3d2nd then 120.0 else " + "cnt_click_3d*1.0/cnt_click_3d2nd end AS rt_click_3d," + "case when cnt_buy_3d=0 then 0.0 when cnt_buy_3d>=10.0*cnt_buy_3d2nd then 10.0 else " + "cnt_buy_3d*1.0/cnt_buy_3d2nd end AS rt_buy_3d," + "case when cnt_collect_3d=0 then 0.0 when cnt_collect_3d>=10.0*cnt_collect_3d2nd then 10.0 else " + "cnt_collect_3d*1.0/cnt_collect_3d2nd end AS rt_collect_3d," + "case when cnt_cart_3d=0 then 0.0 when cnt_cart_3d>=20.0*cnt_cart_3d2nd then 20.0 else " + "cnt_cart_3d*1.0/cnt_cart_3d2nd end AS rt_cart_3d," + "case when cnt_click_1w=0 then 0.0 when cnt_click_1w>=300.0*cnt_click_w2nd then 300.0 else " + "cnt_click_1w*1.0/cnt_click_w2nd end AS rt_click_1w," + "case when cnt_buy_1w=0 then 0.0 when cnt_buy_1w>=15.0*cnt_buy_w2nd then 15.0 else " + "cnt_buy_1w*1.0/cnt_buy_w2nd end AS rt_buy_1w," + "case when cnt_collect_1w=0 then 0.0 when cnt_collect_1w>=15.0*cnt_collect_w2nd then 15.0 else " + "cnt_collect_1w*1.0/cnt_collect_w2nd end AS rt_collect_1w," + "case when cnt_cart_1w=0 then 0.0 when cnt_cart_1w>=40.0*cnt_cart_w2nd then 40.0 else " + "cnt_cart_1w*1.0/cnt_cart_w2nd end AS rt_cart_1w," + "case when cnt_click_1m=0 then 0.0 when cnt_click_1m>=500.0*cnt_click_m2nd then 500.0 else " + "cnt_click_1m*1.0/cnt_click_m2nd end AS rt_click_1m," + "case when cnt_buy_1m=0 then 0.0 when cnt_buy_1m>=30.0*cnt_buy_m2nd then 30.0 else " + "cnt_buy_1m*1.0/cnt_buy_m2nd end AS rt_buy_1m," + "case when cnt_collect_1m=0 then 0.0 when cnt_collect_1m>=30.0*cnt_collect_m2nd then 30.0 else " + "cnt_collect_1m*1.0/cnt_collect_m2nd end AS rt_collect_1m," + "case when cnt_cart_1m=0 then 0.0 when cnt_cart_1m>=50.0*cnt_cart_m2nd then 50.0 else " + "cnt_cart_1m*1.0/cnt_cart_m2nd end AS rt_cart_1m";
    t1_userbrand = t1_userbrand.select(clauseUserBrand_Rate);
    String clauseUser = "user_id, " + "SUM(is_click) as user_cnt_click, SUM(is_buy) as user_cnt_buy, " + "SUM(is_collect) as user_cnt_collect, SUM(is_cart) as user_cnt_cart, " + "SUM(is_click_1m) as user_cnt_click_1m, SUM(is_buy_1m) as user_cnt_buy_1m, " + "SUM(is_collect_1m) as user_cnt_collect_1m, SUM(is_cart_1m) as user_cnt_cart_1m, " + "SUM(is_click_2m) as user_cnt_click_2m, SUM(is_buy_2m) as user_cnt_buy_2m, " + "SUM(is_collect_2m) as user_cnt_collect_2m, SUM(is_cart_2m) as user_cnt_cart_2m, " + "SUM(is_click_3m) as user_cnt_click_3m, SUM(is_buy_3m) as user_cnt_buy_3m, " + "SUM(is_collect_3m) as user_cnt_collect_3m, SUM(is_cart_3m) as user_cnt_cart_3m, " + "SUM(is_click_m2nd) as user_cnt_click_m2nd, SUM(is_buy_m2nd) as user_cnt_buy_m2nd, " + "SUM(is_collect_m2nd) as user_cnt_collect_m2nd, SUM(is_cart_m2nd) as user_cnt_cart_m2nd, " + "SUM(is_click_m3th) as user_cnt_click_m3th, SUM(is_buy_m3th) as user_cnt_buy_m3th, " + "SUM(is_collect_m3th) as user_cnt_collect_m3th, SUM(is_cart_m3th) as user_cnt_cart_m3th, " + "SUM(is_click_3d) as user_cnt_click_3d, SUM(is_buy_3d) as user_cnt_buy_3d, " + "SUM(is_collect_3d) as user_cnt_collect_3d, SUM(is_cart_3d) as user_cnt_cart_3d, " + "SUM(is_click_3d2nd) as user_cnt_click_3d2nd, SUM(is_buy_3d2nd) as user_cnt_buy_3d2nd, " + "SUM(is_collect_3d2nd) as user_cnt_collect_3d2nd, SUM(is_cart_3d2nd) as user_cnt_cart_3d2nd, " + "SUM(is_click_3d3th) as user_cnt_click_3d3th, SUM(is_buy_3d3th) as user_cnt_buy_3d3th, " + "SUM(is_collect_3d3th) as user_cnt_collect_3d3th, SUM(is_cart_3d3th) as user_cnt_cart_3d3th, " + "SUM(is_click_1w) as user_cnt_click_1w, SUM(is_buy_1w) as user_cnt_buy_1w, " + "SUM(is_collect_1w) as user_cnt_collect_1w, SUM(is_cart_1w) as user_cnt_cart_1w, " + "SUM(is_click_w2nd) as user_cnt_click_w2nd, SUM(is_buy_w2nd) as user_cnt_buy_w2nd, " + "SUM(is_collect_w2nd) as user_cnt_collect_w2nd, SUM(is_cart_w2nd) as user_cnt_cart_w2nd, " + "SUM(is_click_w3th) as user_cnt_click_w3th, SUM(is_buy_w3th) as user_cnt_buy_w3th, " + "SUM(is_collect_w3th) as user_cnt_collect_w3th, SUM(is_cart_w3th) as user_cnt_cart_w3th";
    BatchOperator t1_user = t1_preproc.groupBy("user_id", clauseUser);
    String clauseUser_Rate = "user_id AS user_id4join," + "user_cnt_click,user_cnt_buy,user_cnt_collect,user_cnt_cart," + "user_cnt_click_1m,user_cnt_buy_1m,user_cnt_collect_1m,user_cnt_cart_1m," + "user_cnt_click_2m,user_cnt_buy_2m,user_cnt_collect_2m,user_cnt_cart_2m," + "user_cnt_click_3m,user_cnt_buy_3m,user_cnt_collect_3m,user_cnt_cart_3m," + "user_cnt_click_m2nd,user_cnt_buy_m2nd,user_cnt_collect_m2nd,user_cnt_cart_m2nd," + "user_cnt_click_m3th,user_cnt_buy_m3th,user_cnt_collect_m3th,user_cnt_cart_m3th," + "user_cnt_click_3d,user_cnt_buy_3d,user_cnt_collect_3d,user_cnt_cart_3d," + "user_cnt_click_3d2nd,user_cnt_buy_3d2nd,user_cnt_collect_3d2nd,user_cnt_cart_3d2nd," + "user_cnt_click_3d3th,user_cnt_buy_3d3th,user_cnt_collect_3d3th,user_cnt_cart_3d3th," + "user_cnt_click_1w,user_cnt_buy_1w,user_cnt_collect_1w,user_cnt_cart_1w," + "user_cnt_click_w2nd,user_cnt_buy_w2nd,user_cnt_collect_w2nd,user_cnt_cart_w2nd," + "user_cnt_click_w3th,user_cnt_buy_w3th,user_cnt_collect_w3th,user_cnt_cart_w3th," + "case when user_cnt_buy>user_cnt_click then 1.0 when user_cnt_buy=0 then 0.0 else " + "user_cnt_buy*1.0/user_cnt_click end AS user_rt_click2buy," + "case when user_cnt_buy>user_cnt_collect then 1.0 when user_cnt_buy=0 then 0.0 else " + "user_cnt_buy*1.0/user_cnt_collect end AS user_rt_collect2buy," + "case when user_cnt_buy>user_cnt_cart then 1.0 when user_cnt_buy=0 then 0.0 else " + "user_cnt_buy*1.0/user_cnt_cart end AS user_rt_cart2buy," + "case when user_cnt_buy_3d>user_cnt_click_3d then 1.0 when user_cnt_buy_3d=0 then 0.0 else " + "user_cnt_buy_3d*1.0/user_cnt_click_3d end AS user_rt_click2buy_3d," + "case when user_cnt_buy_3d>user_cnt_collect_3d then 1.0 when user_cnt_buy_3d=0 then 0.0 else " + "user_cnt_buy_3d*1.0/user_cnt_collect_3d end AS user_rt_collect2buy_3d," + "case when user_cnt_buy_3d>user_cnt_cart_3d then 1.0 when user_cnt_buy_3d=0 then 0.0 else " + "user_cnt_buy_3d*1.0/user_cnt_cart_3d end AS user_rt_cart2buy_3d," + "case when user_cnt_buy_1w>user_cnt_click_1w then 1.0 when user_cnt_buy_1w=0 then 0.0 else " + "user_cnt_buy_1w*1.0/user_cnt_click_1w end AS user_rt_click2buy_1w," + "case when user_cnt_buy_1w>user_cnt_collect_1w then 1.0 when user_cnt_buy_1w=0 then 0.0 else " + "user_cnt_buy_1w*1.0/user_cnt_collect_1w end AS user_rt_collect2buy_1w," + "case when user_cnt_buy_1w>user_cnt_cart_1w then 1.0 when user_cnt_buy_1w=0 then 0.0 else " + "user_cnt_buy_1w*1.0/user_cnt_cart_1w end AS user_rt_cart2buy_1w," + "case when user_cnt_buy_1m>user_cnt_click_1m then 1.0 when user_cnt_buy_1m=0 then 0.0 else " + "user_cnt_buy_1m*1.0/user_cnt_click_1m end AS user_rt_click2buy_1m," + "case when user_cnt_buy_1m>user_cnt_collect_1m then 1.0 when user_cnt_buy_1m=0 then 0.0 else " + "user_cnt_buy_1m*1.0/user_cnt_collect_1m end AS user_rt_collect2buy_1m," + "case when user_cnt_buy_1m>user_cnt_cart_1m then 1.0 when user_cnt_buy_1m=0 then 0.0 else " + "user_cnt_buy_1m*1.0/user_cnt_cart_1m end AS user_rt_cart2buy_1m";
    t1_user = t1_user.select(clauseUser_Rate);
    String clauseBrand = "brand_id, " + "SUM(is_click) as brand_cnt_click, SUM(is_buy) as brand_cnt_buy, " + "SUM(is_collect) as brand_cnt_collect, SUM(is_cart) as brand_cnt_cart, " + "SUM(is_click_1m) as brand_cnt_click_1m, SUM(is_buy_1m) as brand_cnt_buy_1m, " + "SUM(is_collect_1m) as brand_cnt_collect_1m, SUM(is_cart_1m) as brand_cnt_cart_1m, " + "SUM(is_click_2m) as brand_cnt_click_2m, SUM(is_buy_2m) as brand_cnt_buy_2m, " + "SUM(is_collect_2m) as brand_cnt_collect_2m, SUM(is_cart_2m) as brand_cnt_cart_2m, " + "SUM(is_click_3m) as brand_cnt_click_3m, SUM(is_buy_3m) as brand_cnt_buy_3m, " + "SUM(is_collect_3m) as brand_cnt_collect_3m, SUM(is_cart_3m) as brand_cnt_cart_3m, " + "SUM(is_click_m2nd) as brand_cnt_click_m2nd, SUM(is_buy_m2nd) as brand_cnt_buy_m2nd, " + "SUM(is_collect_m2nd) as brand_cnt_collect_m2nd, SUM(is_cart_m2nd) as brand_cnt_cart_m2nd, " + "SUM(is_click_m3th) as brand_cnt_click_m3th, SUM(is_buy_m3th) as brand_cnt_buy_m3th, " + "SUM(is_collect_m3th) as brand_cnt_collect_m3th, SUM(is_cart_m3th) as brand_cnt_cart_m3th, " + "SUM(is_click_3d) as brand_cnt_click_3d, SUM(is_buy_3d) as brand_cnt_buy_3d, " + "SUM(is_collect_3d) as brand_cnt_collect_3d, SUM(is_cart_3d) as brand_cnt_cart_3d, " + "SUM(is_click_3d2nd) as brand_cnt_click_3d2nd, SUM(is_buy_3d2nd) as brand_cnt_buy_3d2nd, " + "SUM(is_collect_3d2nd) as brand_cnt_collect_3d2nd, SUM(is_cart_3d2nd) as brand_cnt_cart_3d2nd, " + "SUM(is_click_3d3th) as brand_cnt_click_3d3th, SUM(is_buy_3d3th) as brand_cnt_buy_3d3th, " + "SUM(is_collect_3d3th) as brand_cnt_collect_3d3th, SUM(is_cart_3d3th) as brand_cnt_cart_3d3th, " + "SUM(is_click_1w) as brand_cnt_click_1w, SUM(is_buy_1w) as brand_cnt_buy_1w, " + "SUM(is_collect_1w) as brand_cnt_collect_1w, SUM(is_cart_1w) as brand_cnt_cart_1w, " + "SUM(is_click_w2nd) as brand_cnt_click_w2nd, SUM(is_buy_w2nd) as brand_cnt_buy_w2nd, " + "SUM(is_collect_w2nd) as brand_cnt_collect_w2nd, SUM(is_cart_w2nd) as brand_cnt_cart_w2nd, " + "SUM(is_click_w3th) as brand_cnt_click_w3th, SUM(is_buy_w3th) as brand_cnt_buy_w3th, " + "SUM(is_collect_w3th) as brand_cnt_collect_w3th, SUM(is_cart_w3th) as brand_cnt_cart_w3th";
    BatchOperator t1_brand = t1_preproc.groupBy("brand_id", clauseBrand);
    String clauseBrand_Rate = "brand_id AS brand_id4join," + "brand_cnt_click,brand_cnt_buy,brand_cnt_collect,brand_cnt_cart," + "brand_cnt_click_1m,brand_cnt_buy_1m,brand_cnt_collect_1m,brand_cnt_cart_1m," + "brand_cnt_click_2m,brand_cnt_buy_2m,brand_cnt_collect_2m,brand_cnt_cart_2m," + "brand_cnt_click_3m,brand_cnt_buy_3m,brand_cnt_collect_3m,brand_cnt_cart_3m," + "brand_cnt_click_m2nd,brand_cnt_buy_m2nd,brand_cnt_collect_m2nd,brand_cnt_cart_m2nd," + "brand_cnt_click_m3th,brand_cnt_buy_m3th,brand_cnt_collect_m3th,brand_cnt_cart_m3th," + "brand_cnt_click_3d,brand_cnt_buy_3d,brand_cnt_collect_3d,brand_cnt_cart_3d," + "brand_cnt_click_3d2nd,brand_cnt_buy_3d2nd,brand_cnt_collect_3d2nd,brand_cnt_cart_3d2nd," + "brand_cnt_click_3d3th,brand_cnt_buy_3d3th,brand_cnt_collect_3d3th,brand_cnt_cart_3d3th," + "brand_cnt_click_1w,brand_cnt_buy_1w,brand_cnt_collect_1w,brand_cnt_cart_1w," + "brand_cnt_click_w2nd,brand_cnt_buy_w2nd,brand_cnt_collect_w2nd,brand_cnt_cart_w2nd," + "brand_cnt_click_w3th,brand_cnt_buy_w3th,brand_cnt_collect_w3th,brand_cnt_cart_w3th," + "case when brand_cnt_buy>brand_cnt_click then 1.0 when brand_cnt_buy=0 then 0.0 else " + "brand_cnt_buy*1.0/brand_cnt_click end AS brand_rt_click2buy," + "case when brand_cnt_buy>brand_cnt_collect then 1.0 when brand_cnt_buy=0 then 0.0 else " + "brand_cnt_buy*1.0/brand_cnt_collect end AS brand_rt_collect2buy," + "case when brand_cnt_buy>brand_cnt_cart then 1.0 when brand_cnt_buy=0 then 0.0 else " + "brand_cnt_buy*1.0/brand_cnt_cart end AS brand_rt_cart2buy," + "case when brand_cnt_buy_3d>brand_cnt_click_3d then 1.0 when brand_cnt_buy_3d=0 then 0.0 else " + "brand_cnt_buy_3d*1.0/brand_cnt_click_3d end AS brand_rt_click2buy_3d," + "case when brand_cnt_buy_3d>brand_cnt_collect_3d then 1.0 when brand_cnt_buy_3d=0 then 0.0 else " + "brand_cnt_buy_3d*1.0/brand_cnt_collect_3d end AS brand_rt_collect2buy_3d," + "case when brand_cnt_buy_3d>brand_cnt_cart_3d then 1.0 when brand_cnt_buy_3d=0 then 0.0 else " + "brand_cnt_buy_3d*1.0/brand_cnt_cart_3d end AS brand_rt_cart2buy_3d," + "case when brand_cnt_buy_1w>brand_cnt_click_1w then 1.0 when brand_cnt_buy_1w=0 then 0.0 else " + "brand_cnt_buy_1w*1.0/brand_cnt_click_1w end AS brand_rt_click2buy_1w," + "case when brand_cnt_buy_1w>brand_cnt_collect_1w then 1.0 when brand_cnt_buy_1w=0 then 0.0 else " + "brand_cnt_buy_1w*1.0/brand_cnt_collect_1w end AS brand_rt_collect2buy_1w," + "case when brand_cnt_buy_1w>brand_cnt_cart_1w then 1.0 when brand_cnt_buy_1w=0 then 0.0 else " + "brand_cnt_buy_1w*1.0/brand_cnt_cart_1w end AS brand_cart2buy_1w," + "case when brand_cnt_buy_1m>brand_cnt_click_1m then 1.0 when brand_cnt_buy_1m=0 then 0.0 else " + "brand_cnt_buy_1m*1.0/brand_cnt_click_1m end AS brand_rt_click2buy_1m," + "case when brand_cnt_buy_1m>brand_cnt_collect_1m then 1.0 when brand_cnt_buy_1m=0 then 0.0 else " + "brand_cnt_buy_1m*1.0/brand_cnt_collect_1m end AS brand_rt_collect2buy_1m," + "case when brand_cnt_buy_1m>brand_cnt_cart_1m then 1.0 when brand_cnt_buy_1m=0 then 0.0 else " + "brand_cnt_buy_1m*1.0/brand_cnt_cart_1m end AS brand_rt_cart2buy_1m";
    t1_brand = t1_brand.select(clauseBrand_Rate);
    BatchOperator t1_join = new JoinBatchOp().setSelectClause("*").setJoinPredicate("user_id=user_id4join").linkFrom(t1_userbrand, t1_user);
    t1_join = new JoinBatchOp().setSelectClause("*").setJoinPredicate("brand_id=brand_id4join").linkFrom(t1_join, t1_brand);
    BatchOperator t2_label = t2.filter("type=1").select("user_id AS user_id4label, brand_id AS brand_id4label, 1 as label").distinct();
    BatchOperator feature_label = new LeftOuterJoinBatchOp().setSelectClause("*").setJoinPredicate("user_id = user_id4label AND brand_id = brand_id4label").linkFrom(t1_join, t2_label);
    Imputer imputer = new Imputer().setStrategy("value").setFillValue("0").setSelectedCols("label");
    feature_label = imputer.fit(feature_label).transform(feature_label);
    System.out.println(feature_label.getSchema());
    String[] featureColNames = ArrayUtils.removeElements(feature_label.getColNames(), new String[] { "user_id", "brand_id", "user_id4join", "brand_id4join", "user_id4label", "brand_id4label", LABEL_COL_NAME });
    StringBuilder sbd = new StringBuilder();
    for (String name : featureColNames) {
        sbd.append("CAST(").append(name).append(" AS DOUBLE) AS ").append(name).append(", ");
    }
    sbd.append(LABEL_COL_NAME);
    feature_label.select(sbd.toString()).link(new AkSinkBatchOp().setFilePath(DATA_DIR + FEATURE_LABEL_FILE).setOverwriteSink(true));
    BatchOperator.execute();
}
Also used : LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) Imputer(com.alibaba.alink.pipeline.dataproc.Imputer) LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) JoinBatchOp(com.alibaba.alink.operator.batch.sql.JoinBatchOp) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator)

Example 3 with LeftOuterJoinBatchOp

use of com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp in project Alink by alibaba.

the class FmRecommTrainBatchOp method linkFrom.

/**
 * There are 3 input tables: 1) user-item-label table, 2) user features table, 3) item features table.
 * If user or item features table is missing, then use their IDs as features.
 */
@Override
public FmRecommTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> samplesOp = inputs[0];
    final Long envId = samplesOp.getMLEnvironmentId();
    BatchOperator<?> userFeaturesOp = inputs.length >= 2 ? inputs[1] : null;
    BatchOperator<?> itemFeaturesOp = inputs.length >= 3 ? inputs[2] : null;
    Params params = getParams();
    String userCol = params.get(USER_COL);
    String itemCol = params.get(ITEM_COL);
    String labelCol = params.get(RATE_COL);
    String[] userFeatureCols = params.get(USER_FEATURE_COLS);
    String[] itemFeatureCols = params.get(ITEM_FEATURE_COLS);
    String[] userCateFeatureCols = params.get(USER_CATEGORICAL_FEATURE_COLS);
    String[] itemCateFeatureCols = params.get(ITEM_CATEGORICAL_FEATURE_COLS);
    if (userFeaturesOp == null) {
        userFeaturesOp = samplesOp.select("`" + userCol + "`").distinct();
        userFeatureCols = new String[] { userCol };
        userCateFeatureCols = new String[] { userCol };
    } else {
        Preconditions.checkArgument(TableUtil.findColTypeWithAssert(userFeaturesOp.getSchema(), userCol).equals(TableUtil.findColTypeWithAssert(samplesOp.getSchema(), userCol)), "user column type mismatch");
    }
    if (itemFeaturesOp == null) {
        itemFeaturesOp = samplesOp.select("`" + itemCol + "`").distinct();
        itemFeatureCols = new String[] { itemCol };
        itemCateFeatureCols = new String[] { itemCol };
    } else {
        Preconditions.checkArgument(TableUtil.findColTypeWithAssert(itemFeaturesOp.getSchema(), itemCol).equals(TableUtil.findColTypeWithAssert(samplesOp.getSchema(), itemCol)), "item column type mismatch");
    }
    BatchOperator<?> history = samplesOp.select(new String[] { userCol, itemCol });
    userFeaturesOp = createFeatureVectors(userFeaturesOp, userCol, userFeatureCols, userCateFeatureCols);
    itemFeaturesOp = createFeatureVectors(itemFeaturesOp, itemCol, itemFeatureCols, itemCateFeatureCols);
    LeftOuterJoinBatchOp joinOp1 = new LeftOuterJoinBatchOp().setMLEnvironmentId(envId).setJoinPredicate("a.`" + userCol + "`=" + "b.`" + userCol + "`").setSelectClause("a.*, b.__fm_features__ as __user_features__");
    LeftOuterJoinBatchOp joinOp2 = new LeftOuterJoinBatchOp().setMLEnvironmentId(envId).setJoinPredicate("a.`" + itemCol + "`=" + "b.`" + itemCol + "`").setSelectClause("a.*, b.__fm_features__ as __item_features__");
    samplesOp = joinOp1.linkFrom(samplesOp, userFeaturesOp);
    samplesOp = joinOp2.linkFrom(samplesOp, itemFeaturesOp);
    samplesOp = samplesOp.udf("__user_features__", "__user_features__", new CheckNotNull());
    samplesOp = samplesOp.udf("__item_features__", "__item_features__", new CheckNotNull());
    VectorAssembler va = new VectorAssembler().setMLEnvironmentId(envId).setSelectedCols("__user_features__", "__item_features__").setOutputCol("__alink_features__").setReservedCols(labelCol);
    samplesOp = va.transform(samplesOp);
    BatchOperator<?> fmModel;
    if (!implicitFeedback) {
        fmModel = new FmRegressorTrainBatchOp(params).setLabelCol(params.get(RATE_COL)).setVectorCol("__alink_features__").setMLEnvironmentId(envId);
    } else {
        fmModel = new FmClassifierTrainBatchOp(params).setLabelCol(params.get(RATE_COL)).setVectorCol("__alink_features__").setMLEnvironmentId(envId);
    }
    fmModel.linkFrom(samplesOp);
    BatchOperator<?> model = PackBatchOperatorUtil.packBatchOps(new BatchOperator<?>[] { fmModel, userFeaturesOp, itemFeaturesOp, history });
    setOutputTable(model.getOutputTable());
    return this;
}
Also used : LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) FmRegressorTrainBatchOp(com.alibaba.alink.operator.batch.regression.FmRegressorTrainBatchOp) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) FmRecommTrainParams(com.alibaba.alink.params.recommendation.FmRecommTrainParams) Params(org.apache.flink.ml.api.misc.param.Params) FmClassifierTrainBatchOp(com.alibaba.alink.operator.batch.classification.FmClassifierTrainBatchOp)

Example 4 with LeftOuterJoinBatchOp

use of com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp in project Alink by alibaba.

the class HugeMfAlsImpl method factorize.

public static Tuple2<BatchOperator<?>, BatchOperator<?>> factorize(BatchOperator<?> data, Params params, boolean implicit) {
    final Long envId = data.getMLEnvironmentId();
    final String userColName = params.get(AlsTrainParams.USER_COL);
    final String itemColName = params.get(AlsTrainParams.ITEM_COL);
    final String rateColName = params.get(AlsTrainParams.RATE_COL);
    final double lambda = params.get(AlsTrainParams.LAMBDA);
    final int rank = params.get(AlsTrainParams.RANK);
    final int numIter = params.get(AlsTrainParams.NUM_ITER);
    final boolean nonNegative = params.get(AlsTrainParams.NON_NEGATIVE);
    final double alpha = params.get(AlsImplicitTrainParams.ALPHA);
    final int numMiniBatches = params.get(AlsTrainParams.NUM_BLOCKS);
    final int userColIdx = TableUtil.findColIndexWithAssert(data.getColNames(), userColName);
    final int itemColIdx = TableUtil.findColIndexWithAssert(data.getColNames(), itemColName);
    final int rateColIdx = TableUtil.findColIndexWithAssert(data.getColNames(), rateColName);
    TypeInformation<?> userType = data.getColTypes()[userColIdx];
    TypeInformation<?> itemType = data.getColTypes()[itemColIdx];
    boolean isLongTypeId = userType.equals(Types.LONG) && itemType.equals(Types.LONG);
    BatchOperator<?> distinctUsers = null;
    BatchOperator<?> distinctItems = null;
    if (!isLongTypeId) {
        distinctUsers = data.select("`" + userColName + "`").distinct();
        distinctItems = data.select("`" + itemColName + "`").distinct();
        MultiStringIndexerTrainBatchOp msi = new MultiStringIndexerTrainBatchOp().setMLEnvironmentId(envId).setSelectedCols(userColName, itemColName).linkFrom(data);
        MultiStringIndexerPredictBatchOp msiPredictor = new MultiStringIndexerPredictBatchOp().setMLEnvironmentId(envId).setSelectedCols(userColName, itemColName);
        data = msiPredictor.linkFrom(msi, data);
        distinctUsers = new MultiStringIndexerPredictBatchOp().setMLEnvironmentId(envId).setSelectedCols(userColName).setOutputCols("__user_index").linkFrom(msi, distinctUsers);
        distinctItems = new MultiStringIndexerPredictBatchOp().setMLEnvironmentId(envId).setSelectedCols(itemColName).setOutputCols("__item_index").linkFrom(msi, distinctItems);
    }
    // tuple3: userId, itemId, rating
    DataSet<Tuple3<Long, Long, Float>> alsInput = data.getDataSet().map(new MapFunction<Row, Tuple3<Long, Long, Float>>() {

        private static final long serialVersionUID = 6671683813980584160L;

        @Override
        public Tuple3<Long, Long, Float> map(Row value) {
            Object user = value.getField(userColIdx);
            Object item = value.getField(itemColIdx);
            Object rating = value.getField(rateColIdx);
            Preconditions.checkNotNull(user, "user is null");
            Preconditions.checkNotNull(item, "item is null");
            Preconditions.checkNotNull(rating, "rating is null");
            return new Tuple3<>(((Number) user).longValue(), ((Number) item).longValue(), ((Number) rating).floatValue());
        }
    });
    AlsTrain als = new AlsTrain(rank, numIter, lambda, implicit, alpha, numMiniBatches, nonNegative);
    DataSet<Tuple3<Byte, Long, float[]>> factors = als.fit(alsInput);
    BatchOperator<?> userFactors = getFactors(envId, factors, userColName, (byte) 0);
    BatchOperator<?> itemFactors = getFactors(envId, factors, itemColName, (byte) 1);
    if (!isLongTypeId) {
        BatchOperator<?> joinUser = new LeftOuterJoinBatchOp().setMLEnvironmentId(envId).setJoinPredicate(String.format("a.`%s`=b.__user_index", userColName)).setSelectClause(String.format("b.`%s`, a.`%s`", userColName, "factors"));
        userFactors = joinUser.linkFrom(userFactors, distinctUsers);
        BatchOperator<?> joinItem = new LeftOuterJoinBatchOp().setMLEnvironmentId(envId).setJoinPredicate(String.format("a.`%s`=b.__item_index", itemColName)).setSelectClause(String.format("b.`%s`, a.`%s`", itemColName, "factors"));
        itemFactors = joinItem.linkFrom(itemFactors, distinctItems);
    }
    return Tuple2.of(userFactors, itemFactors);
}
Also used : LeftOuterJoinBatchOp(com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp) MultiStringIndexerTrainBatchOp(com.alibaba.alink.operator.batch.dataproc.MultiStringIndexerTrainBatchOp) MultiStringIndexerPredictBatchOp(com.alibaba.alink.operator.batch.dataproc.MultiStringIndexerPredictBatchOp) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Row(org.apache.flink.types.Row)

Aggregations

LeftOuterJoinBatchOp (com.alibaba.alink.operator.batch.sql.LeftOuterJoinBatchOp)4 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)2 JoinBatchOp (com.alibaba.alink.operator.batch.sql.JoinBatchOp)2 FmClassifierTrainBatchOp (com.alibaba.alink.operator.batch.classification.FmClassifierTrainBatchOp)1 MultiStringIndexerPredictBatchOp (com.alibaba.alink.operator.batch.dataproc.MultiStringIndexerPredictBatchOp)1 MultiStringIndexerTrainBatchOp (com.alibaba.alink.operator.batch.dataproc.MultiStringIndexerTrainBatchOp)1 FmRegressorTrainBatchOp (com.alibaba.alink.operator.batch.regression.FmRegressorTrainBatchOp)1 AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp)1 FullOuterJoinBatchOp (com.alibaba.alink.operator.batch.sql.FullOuterJoinBatchOp)1 RightOuterJoinBatchOp (com.alibaba.alink.operator.batch.sql.RightOuterJoinBatchOp)1 FmRecommTrainParams (com.alibaba.alink.params.recommendation.FmRecommTrainParams)1 Imputer (com.alibaba.alink.pipeline.dataproc.Imputer)1 VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler)1 Tuple3 (org.apache.flink.api.java.tuple.Tuple3)1 Params (org.apache.flink.ml.api.misc.param.Params)1 Row (org.apache.flink.types.Row)1