use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.
the class BcpFileImport method getHttpTask.
/**
 * Builds the task description for web page (HTTP) content.
 *
 * @return a {@link TaskBean} carrying the BCP path, HBase table name and column family,
 *         temporary HFile directory, content type, column names and key columns
 */
private static TaskBean getHttpTask() {
    final String contentType = BigDataConstants.CONTENT_TYPE_HTTP;
    final TaskBean httpTask = new TaskBean();

    // BCP file directory: configured root followed by the content type
    final String bcpRoot = ConfigurationManager.getProperty("bcp.file.path");
    httpTask.setBcpPath(bcpRoot + "/" + contentType);

    // HBase table name
    final String tableName = NamingRuleUtils.getHBaseTableName(contentType);
    httpTask.setHbaseTableName(tableName);

    // HBase column family
    final String columnFamily = NamingRuleUtils.getHBaseContentTableCF();
    httpTask.setHbaseCF(columnFamily);

    // Temporary HDFS directory for the generated HFiles
    final String hfileTaskKey = NamingRuleUtils.getBcpTaskKey(contentType);
    httpTask.setHfileTmpStorePath(NamingRuleUtils.getHFileTaskDir(hfileTaskKey));

    // Data type
    httpTask.setContentType(contentType);

    // Full array of field names, looked up by the task key
    final String columnTaskKey = NamingRuleUtils.getBcpTaskKey(contentType);
    httpTask.setColumns(FieldConstants.BCP_FILE_COLUMN_MAP.get(columnTaskKey));

    // Key fields used for filtering
    final String[] keyColumns = { "ref_domain" };
    httpTask.setKeyColumns(keyColumns);

    final String summary = httpTask.toString();
    logger.info("任务信息: {}", summary);
    return httpTask;
}
use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.
the class BcpImportHBaseSolrService method bcpWriteIntoSolr.
/**
 * Indexes BCP rows into Solr, sending one batch per Spark partition.
 *
 * <p>Element 0 of every row is the row key. It is split on {@code "_"}: the first segment is
 * stored as the capture time and the second segment as the {@code ID} field. Elements 1..n are
 * matched positionally against {@code task.getColumns()} (element {@code i} maps to column
 * {@code i - 1}); null or empty values are not indexed and elements beyond the known columns
 * are ignored.
 *
 * <p>A row key without {@code "_"} makes the partition fail with
 * {@link ArrayIndexOutOfBoundsException}, exactly as before.
 *
 * @param javaRDD rows to index; element 0 of each row is the row key
 * @param task    task description supplying the content type and the column names
 */
public static void bcpWriteIntoSolr(JavaRDD<String[]> javaRDD, TaskBean task) {
    logger.info("开始将 {} 的BCP数据索引到Solr", task.getContentType());

    // Write the data into Solr, partition by partition
    javaRDD.foreachPartition((VoidFunction<Iterator<String[]>>) iterator -> {
        // Field names are schema identifiers: convert case with Locale.ROOT so the result
        // does not depend on the JVM default locale (e.g. the Turkish dotted/dotless i).
        final String contentIdField = BigDataConstants.SOLR_CONTENT_ID.toUpperCase(java.util.Locale.ROOT);
        final String importTimeTextField = "import_time".toUpperCase(java.util.Locale.ROOT);
        final String importTimeMillisField = "import_time".toLowerCase(java.util.Locale.ROOT);

        // Row-independent lookups, done once per partition instead of once per row
        final Object docType = FieldConstants.DOC_TYPE_MAP.get(task.getContentType());
        final String[] columns = task.getColumns();

        List<SolrInputDocument> batch = new ArrayList<>();
        while (iterator.hasNext()) {
            String[] row = iterator.next();
            String rowkey = row[0];
            // Split once: segment 0 is used as capture time, segment 1 as ID
            String[] rowkeyParts = rowkey.split("_");

            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("ID", rowkeyParts[1]);
            doc.addField(contentIdField, rowkey);
            doc.addField(BigDataConstants.SOLR_DOC_TYPE_KEY, docType);
            doc.addField(BigDataConstants.CAPTURE_TIME, rowkeyParts[0]);

            // Import time is recorded twice: formatted text and epoch milliseconds
            Date importTime = new Date();
            doc.addField(importTimeTextField, DateFormatUtils.DATE_TIME_FORMAT.format(importTime));
            doc.addField(importTimeMillisField, importTime.getTime());

            // Stop at whichever ends first: the row values or the known column names
            for (int i = 1; i < row.length && i - 1 < columns.length; i++) {
                String value = row[i];
                if (value != null && !value.isEmpty()) {
                    doc.addField(columns[i - 1].toUpperCase(java.util.Locale.ROOT), value);
                }
            }
            batch.add(doc);
        }

        if (!batch.isEmpty()) {
            SolrUtil.setCloudSolrClientDefaultCollection(client, new Date());
            // NOTE(review): 1000 is presumably SolrJ's commitWithin in milliseconds - confirm
            client.add(batch, 1000);
            logger.info("---->写入Solr成功数据量:{}", batch.size());
        } else {
            logger.info("{} 此Spark Partition 数据为空", task.getContentType());
        }
    });
    logger.info("####### {}的BCP数据索引Solr完成 #######", task.getContentType());
}
use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.
the class Main method getHttpTask.
/**
 * Builds the task description for web page (HTTP) content.
 *
 * @return a {@link TaskBean} carrying the data path, capture-time index, content type,
 *         Solr doc type, column names, temporary HFile directory and HBase table settings
 */
private static TaskBean getHttpTask() {
    final String contentType = BigDataConstants.CONTENT_TYPE_HTTP;
    final TaskBean httpTask = new TaskBean();

    // Data path: the "${task}" placeholder of the template is replaced by the content type
    final String dataPath = tsvDataPathTemplate.replace("${task}", contentType);
    httpTask.setBcpPath(dataPath);

    // NOTE(review): 22 is presumably the position of the capture-time field in a row - confirm
    final int captureTimeIndex = 22;
    httpTask.setCaptureTimeIndex(captureTimeIndex);

    httpTask.setContentType(contentType);
    httpTask.setDocType(BigDataConstants.SOLR_DOC_TYPE_HTTP_VALUE);

    // Column names, looked up by the task key
    final String columnTaskKey = NamingRuleUtils.getBcpTaskKey(contentType);
    httpTask.setColumns(FieldConstants.BCP_FILE_COLUMN_MAP.get(columnTaskKey));

    // Temporary directory for the generated HFiles
    final String hfileTaskKey = NamingRuleUtils.getBcpTaskKey(contentType);
    httpTask.setHfileTmpStorePath(NamingRuleUtils.getHFileTaskDir(hfileTaskKey));

    // HBase column family and table name
    final String columnFamily = NamingRuleUtils.getHBaseContentTableCF();
    httpTask.setHbaseCF(columnFamily);
    final String tableName = NamingRuleUtils.getHBaseTableName(contentType);
    httpTask.setHbaseTableName(tableName);

    return httpTask;
}
use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.
the class SparkOperateBcp method run.
/**
 * Reads the task's BCP files with Spark, keeps the rows whose field count matches a known
 * layout and passes them to {@code bcpWriteIntoHBase}.
 *
 * <p>The Spark context is always closed, including when reading or writing fails.
 *
 * @param task task description supplying the content type (also used as the Spark application
 *             name), the BCP path and the column names
 */
public static void run(TaskBean task) {
    logger.info("开始处理 {} 的BCP数据", task.getContentType());
    SparkConf conf = new SparkConf().setAppName(task.getContentType());

    // Computed once on the driver so the filter closure captures two plain values instead of
    // the whole task. The column-name array has no id field, hence the "+ 1".
    final int expectedLength = task.getColumns().length + 1;
    final boolean httpTask = BigDataConstants.CONTENT_TYPE_HTTP.equalsIgnoreCase(task.getContentType());

    // try-with-resources: the context is released even if the import throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        JavaRDD<String> originalRDD = sc.textFile(task.getBcpPath());

        // Split every line into its tab-separated fields; a plain map streams the rows instead
        // of buffering a whole partition in memory.
        // NOTE(review): split("\t") discards trailing empty fields, so a line ending in empty
        // columns yields a shorter array and may be rejected by the length filter below -
        // confirm this is intended before switching to split("\t", -1).
        JavaRDD<String[]> valueArrayRDD = originalRDD.map(
                (Function<String, String[]>) line -> line.split("\t"));

        /*
         * Filter the rows by field count.
         * The BCP format may have been upgraded with new fields:
         * FTP and IM_CHAT gained three fields: "service_code_out", "terminal_longitude",
         * "terminal_latitude".
         * HTTP gained seven fields: the same three plus "manufacturer_code", "zipname",
         * "bcpname", "rownumber".
         * A row is therefore accepted when its length equals the expected length, is 3 short
         * of it, or - for HTTP only - is 3 + 4 short of it.
         */
        JavaRDD<String[]> filterValuesRDD = valueArrayRDD.filter(
                (Function<String[], Boolean>) fields ->
                        expectedLength == fields.length
                                || expectedLength == fields.length + 3
                                || (httpTask && expectedLength == fields.length + 3 + 4));

        // Write the BCP data into HBase
        bcpWriteIntoHBase(filterValuesRDD, task);
    }
}
use of com.rainsoft.domain.TaskBean in project beijingThirdPeriod by weidongcao.
the class BcpFileImport method main.
/**
 * Entry point: builds the FTP, IM chat and HTTP tasks and imports them round after round,
 * pausing five seconds after each round.
 *
 * <p>The loop ends when the thread is interrupted. The interrupt flag is restored so the
 * shutdown request stays visible, instead of being swallowed and the loop running forever.
 *
 * @param args command line arguments (unused)
 */
public static void main(String[] args) {
    TaskBean ftp = getFtpTask();
    TaskBean imChat = getImchatTask();
    TaskBean http = getHttpTask();
    BcpImportHBaseSolrService bcpImportHBaseSolrService = new BcpImportHBaseSolrService();

    while (!Thread.currentThread().isInterrupted()) {
        bcpImportHBaseSolrService.bcpImportHBaseSolr(ftp);
        bcpImportHBaseSolrService.bcpImportHBaseSolr(imChat);
        bcpImportHBaseSolrService.bcpImportHBaseSolr(http);
        try {
            logger.info("一次任务处理完成休眠5秒");
            Thread.sleep(5 * 1000);
        } catch (InterruptedException e) {
            // Restore the flag: the loop condition then ends the loop. Merely continuing would
            // make every following sleep fail immediately and turn the loop into a busy spin.
            Thread.currentThread().interrupt();
            logger.warn("休眠被中断,停止任务循环", e);
        }
    }
}
Aggregations