Search in sources :

Example 1 with TaskBean

use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.

The class BcpFileImport, method getHttpTask.

/**
 * Builds the import task descriptor for web-page (HTTP) BCP data.
 *
 * @return a fully populated {@link TaskBean} for the HTTP content type
 */
private static TaskBean getHttpTask() {
    String contentType = BigDataConstants.CONTENT_TYPE_HTTP;
    String taskKey = NamingRuleUtils.getBcpTaskKey(contentType);
    TaskBean bean = new TaskBean();
    // Directory holding the BCP files for this content type
    bean.setBcpPath(ConfigurationManager.getProperty("bcp.file.path") + "/" + contentType);
    // Target HBase table name
    bean.setHbaseTableName(NamingRuleUtils.getHBaseTableName(contentType));
    // Target HBase column family
    bean.setHbaseCF(NamingRuleUtils.getHBaseContentTableCF());
    // Temporary HDFS directory for generated HFiles
    bean.setHfileTmpStorePath(NamingRuleUtils.getHFileTaskDir(taskKey));
    // Content type of the data
    bean.setContentType(contentType);
    // Full column-name array for this task
    bean.setColumns(FieldConstants.BCP_FILE_COLUMN_MAP.get(taskKey));
    // Key columns used for filtering
    bean.setKeyColumns(new String[] { "ref_domain" });
    logger.info("任务信息: {}", bean.toString());
    return bean;
}
Also used : TaskBean(com.rainsoft.domain.TaskBean)

Example 2 with TaskBean

use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.

The class BcpImportHBaseSolrService, method bcpWriteIntoSolr.

/**
 * Indexes BCP rows into Solr, one Spark partition at a time.
 * <p>
 * Each row is a field array whose first element is the rowkey
 * (layout: {@code <captureTime>_<id>}); the remaining elements are mapped
 * onto the task's configured column names.
 *
 * @param javaRDD rows to index; element 0 of each array is the rowkey
 * @param task    task descriptor supplying content type and column names
 */
public static void bcpWriteIntoSolr(JavaRDD<String[]> javaRDD, TaskBean task) {
    logger.info("开始将 {} 的BCP数据索引到Solr", task.getContentType());
    /*
     * Write the data into Solr.
     */
    javaRDD.foreachPartition((VoidFunction<Iterator<String[]>>) iterator -> {
        List<SolrInputDocument> list = new ArrayList<>();
        // Loop invariants hoisted out of the per-row loop.
        String[] columns = task.getColumns();
        String docType = FieldConstants.DOC_TYPE_MAP.get(task.getContentType());
        while (iterator.hasNext()) {
            String[] str = iterator.next();
            SolrInputDocument doc = new SolrInputDocument();
            // Rowkey layout: <captureTime>_<id>
            String rowkey = str[0];
            String[] rowkeyParts = rowkey.split("_");
            doc.addField("ID", rowkeyParts[1]);
            doc.addField(BigDataConstants.SOLR_CONTENT_ID.toUpperCase(), rowkey);
            doc.addField(BigDataConstants.SOLR_DOC_TYPE_KEY, docType);
            doc.addField(BigDataConstants.CAPTURE_TIME, rowkeyParts[0]);
            Date curDate = new Date();
            doc.addField("import_time".toUpperCase(), DateFormatUtils.DATE_TIME_FORMAT.format(curDate));
            doc.addField("import_time".toLowerCase(), curDate.getTime());
            // Map remaining values onto column names; ignore trailing fields
            // beyond the configured column list (upgraded BCP layouts).
            int fieldCount = Math.min(str.length, columns.length + 1);
            for (int i = 1; i < fieldCount; i++) {
                String value = str[i];
                // Skip null/empty values so they are not indexed.
                if ((null != value) && (!"".equals(value))) {
                    doc.addField(columns[i - 1].toUpperCase(), value);
                }
            }
            list.add(doc);
        }
        if (!list.isEmpty()) {
            SolrUtil.setCloudSolrClientDefaultCollection(client, new Date());
            client.add(list, 1000);
            logger.info("---->写入Solr成功数据量:{}", list.size());
        } else {
            logger.info("{} 此Spark Partition 数据为空", task.getContentType());
        }
    });
    logger.info("####### {}的BCP数据索引Solr完成 #######", task.getContentType());
}
Also used : java.util(java.util) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) ArrayUtils(org.apache.commons.lang3.ArrayUtils) VoidFunction(org.apache.spark.api.java.function.VoidFunction) StringUtils(org.apache.commons.lang3.StringUtils) DateFormatUtils(com.rainsoft.utils.DateFormatUtils) StorageLevel(org.apache.spark.storage.StorageLevel) TaskBean(com.rainsoft.domain.TaskBean) ClassPathXmlApplicationContext(org.springframework.context.support.ClassPathXmlApplicationContext) BigDataConstants(com.rainsoft.BigDataConstants) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) FileUtils(com.rainsoft.utils.io.FileUtils) Logger(org.slf4j.Logger) SolrUtil(com.rainsoft.utils.SolrUtil) SparkConf(org.apache.spark.SparkConf) RowkeyColumnSecondarySort(com.rainsoft.hbase.RowkeyColumnSecondarySort) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SolrClient(org.apache.solr.client.solrj.SolrClient) HBaseUtils(com.rainsoft.utils.HBaseUtils) IOUtils(org.apache.commons.io.IOUtils) AbstractApplicationContext(org.springframework.context.support.AbstractApplicationContext) java.io(java.io) ConfigurationManager(com.rainsoft.conf.ConfigurationManager) FieldConstants(com.rainsoft.FieldConstants) Function(org.apache.spark.api.java.function.Function) SolrInputDocument(org.apache.solr.common.SolrInputDocument) SolrInputDocument(org.apache.solr.common.SolrInputDocument)

Example 3 with TaskBean

use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.

The class Main, method getHttpTask.

/**
 * Builds the processing task descriptor for web-page (HTTP) data.
 *
 * @return a populated {@link TaskBean} describing the HTTP task
 */
private static TaskBean getHttpTask() {
    String contentType = BigDataConstants.CONTENT_TYPE_HTTP;
    String taskKey = NamingRuleUtils.getBcpTaskKey(contentType);
    TaskBean bean = new TaskBean();
    // Resolve the TSV data path from the template for this content type.
    bean.setBcpPath(tsvDataPathTemplate.replace("${task}", contentType));
    bean.setCaptureTimeIndex(22);
    bean.setContentType(contentType);
    bean.setDocType(BigDataConstants.SOLR_DOC_TYPE_HTTP_VALUE);
    bean.setColumns(FieldConstants.BCP_FILE_COLUMN_MAP.get(taskKey));
    bean.setHfileTmpStorePath(NamingRuleUtils.getHFileTaskDir(taskKey));
    bean.setHbaseCF(NamingRuleUtils.getHBaseContentTableCF());
    bean.setHbaseTableName(NamingRuleUtils.getHBaseTableName(contentType));
    return bean;
}
Also used : TaskBean(com.rainsoft.domain.TaskBean)

Example 4 with TaskBean

use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.

The class SparkOperateBcp, method run.

/**
 * Processes the BCP data for one task: reads the raw BCP text files from
 * the task's path, splits each line into a field array, filters out rows
 * whose field count does not match any accepted layout, and writes the
 * surviving rows to HBase.
 *
 * @param task task descriptor (content type, BCP path, column names, ...)
 */
public static void run(TaskBean task) {
    logger.info("开始处理 {} 的BCP数据", task.getContentType());
    SparkConf conf = new SparkConf().setAppName(task.getContentType());
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> originalRDD = sc.textFile(task.getBcpPath());
    // Split every BCP line on tabs into a field array.
    // NOTE(review): the original comment also claimed this step generates the
    // ID (HBase RowKey / Solr Sid), but no ID is produced here — verify.
    JavaRDD<String[]> valueArrrayRDD = originalRDD.mapPartitions((FlatMapFunction<Iterator<String>, String[]>) iter -> {
        List<String[]> list = new ArrayList<>();
        while (iter.hasNext()) {
            String str = iter.next();
            String[] fields = str.split("\t");
            list.add(fields);
        }
        return list.iterator();
    });
    /*
     * Filter the rows by expected field count.
     * The configured column-name array contains no id column (HBase RowKey / Solr Sid).
     * The BCP format may have been upgraded with new fields:
     *   - FTP and IM_CHAT tables added three fields: "service_code_out",
     *     "terminal_longitude", "terminal_latitude"
     *   - the HTTP table added seven fields: the same three plus
     *     "manufacturer_code", "zipname", "bcpname", "rownumber"
     * Pre-upgrade files are therefore shorter by 3 (or 3 + 4 for HTTP)
     * than the configured column list, and must still be accepted.
     */
    JavaRDD<String[]> filterValuesRDD;
    filterValuesRDD = valueArrrayRDD.filter((Function<String[], Boolean>) (String[] strings) -> // row matches the configured columns exactly (no upgrade fields)
    (task.getColumns().length + 1 == strings.length) || // row is 3 fields short: pre-upgrade FTP/IM_CHAT file
    ((task.getColumns().length + 1) == (strings.length + 3)) || // HTTP row is 7 fields short: pre-upgrade HTTP file
    (BigDataConstants.CONTENT_TYPE_HTTP.equalsIgnoreCase(task.getContentType()) && ((task.getColumns().length + 1) == (strings.length + 3 + 4))));
    // Write the filtered BCP rows into HBase.
    bcpWriteIntoHBase(filterValuesRDD, task);
    sc.close();
}
Also used : PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) Date(java.util.Date) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) ArrayUtils(org.apache.commons.lang3.ArrayUtils) VoidFunction(org.apache.spark.api.java.function.VoidFunction) DateFormatUtils(com.rainsoft.utils.DateFormatUtils) ArrayList(java.util.ArrayList) TaskBean(com.rainsoft.domain.TaskBean) ClassPathXmlApplicationContext(org.springframework.context.support.ClassPathXmlApplicationContext) BigDataConstants(com.rainsoft.BigDataConstants) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) SolrUtil(com.rainsoft.utils.SolrUtil) SparkConf(org.apache.spark.SparkConf) RowkeyColumnSecondarySort(com.rainsoft.hbase.RowkeyColumnSecondarySort) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SolrClient(org.apache.solr.client.solrj.SolrClient) Serializable(java.io.Serializable) HBaseUtils(com.rainsoft.utils.HBaseUtils) List(java.util.List) AbstractApplicationContext(org.springframework.context.support.AbstractApplicationContext) FieldConstants(com.rainsoft.FieldConstants) Function(org.apache.spark.api.java.function.Function) SolrInputDocument(org.apache.solr.common.SolrInputDocument) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) VoidFunction(org.apache.spark.api.java.function.VoidFunction) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) Function(org.apache.spark.api.java.function.Function) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)

Example 5 with TaskBean

use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.

The class BcpFileImport, method main.

/**
 * Entry point: builds the FTP, IM_CHAT and HTTP import tasks, then loops
 * forever importing each task's BCP data into HBase/Solr, sleeping 5
 * seconds between rounds. Exits cleanly when the thread is interrupted.
 *
 * @param args unused command-line arguments
 */
public static void main(String[] args) {
    TaskBean ftp = getFtpTask();
    TaskBean im_chat = getImchatTask();
    TaskBean http = getHttpTask();
    BcpImportHBaseSolrService bcpImportHBaseSolrService = new BcpImportHBaseSolrService();
    while (true) {
        bcpImportHBaseSolrService.bcpImportHBaseSolr(ftp);
        bcpImportHBaseSolrService.bcpImportHBaseSolr(im_chat);
        bcpImportHBaseSolrService.bcpImportHBaseSolr(http);
        try {
            logger.info("一次任务处理完成休眠5秒");
            Thread.sleep(5 * 1000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag and stop, instead of swallowing the
            // interruption with printStackTrace() and looping forever —
            // otherwise the process cannot be shut down via interruption.
            Thread.currentThread().interrupt();
            logger.warn("导入循环被中断, 退出", e);
            return;
        }
    }
}
Also used : TaskBean(com.rainsoft.domain.TaskBean)

Aggregations

TaskBean (com.rainsoft.domain.TaskBean)10 BigDataConstants (com.rainsoft.BigDataConstants)3 FieldConstants (com.rainsoft.FieldConstants)3 RowkeyColumnSecondarySort (com.rainsoft.hbase.RowkeyColumnSecondarySort)3 DateFormatUtils (com.rainsoft.utils.DateFormatUtils)3 HBaseUtils (com.rainsoft.utils.HBaseUtils)3 SolrUtil (com.rainsoft.utils.SolrUtil)3 ArrayUtils (org.apache.commons.lang3.ArrayUtils)3 SolrClient (org.apache.solr.client.solrj.SolrClient)3 SolrInputDocument (org.apache.solr.common.SolrInputDocument)3 SparkConf (org.apache.spark.SparkConf)3 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)3 JavaRDD (org.apache.spark.api.java.JavaRDD)3 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)3 FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction)3 Function (org.apache.spark.api.java.function.Function)3 PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction)3 VoidFunction (org.apache.spark.api.java.function.VoidFunction)3 Logger (org.slf4j.Logger)3 LoggerFactory (org.slf4j.LoggerFactory)3