
Example 1 with VoidFunction

use of org.apache.spark.api.java.function.VoidFunction in project beijingThirdPeriod by weidongcao.

the class BaseOracleDataExport method export2Solr.

/**
 * Import data from the Oracle content table into Solr.
 *
 * @param javaRDD JavaRDD<Row> of table rows
 * @param task    task name
 */
private static void export2Solr(JavaRDD<Row> javaRDD, String task) {
    // column names of this task's Oracle content table
    String[] columns = FieldConstants.ORACLE_TABLE_COLUMN_MAP.get(NamingRuleUtils.getOracleContentTableName(task));
    javaRDD.foreachPartition((VoidFunction<Iterator<Row>>) iterator -> {
        List<SolrInputDocument> docList = new ArrayList<>();
        Optional<Object> importTime = Optional.empty();
        while (iterator.hasNext()) {
            Row row = iterator.next();
            SolrInputDocument doc = new SolrInputDocument();
            String id = UUID.randomUUID().toString().replace("-", "");
            doc.addField("ID", id);
            doc.addField(BigDataConstants.SOLR_DOC_TYPE_KEY, FieldConstants.DOC_TYPE_MAP.get(task));
            for (int i = 0; i < row.length(); i++) {
                if (i >= columns.length)
                    break;
                if ("import_time".equalsIgnoreCase(columns[i])) {
                    importTime = Optional.of(row.getString(i));
                }
                SolrUtil.addSolrFieldValue(doc, columns[i].toUpperCase(), row.getString(i));
            }
            docList.add(doc);
            // SolrUtil is expected to flush (and clear docList) once the batch reaches writeSize
            SolrUtil.submitToSolr(client, docList, writeSize, importTime);
        }
        // batch size 1 forces a final flush of any remaining documents
        SolrUtil.submitToSolr(client, docList, 1, importTime);
    });
    logger.info("####### {}的数据索引Solr完成 #######", NamingRuleUtils.getOracleContentTableName(task));
}
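
SolrUtil.submitToSolr is project-internal and not shown on this page. For the loop above to avoid resubmitting documents, the helper must flush and then clear docList once the batch reaches writeSize; the trailing call with a batch size of 1 drains whatever is left. A minimal sketch of such a helper on plain SolrJ, under those assumptions (the importTime argument presumably drives time-based collection routing in the real helper and is ignored here):

import java.util.List;
import java.util.Optional;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.common.SolrInputDocument;

public final class SolrBatchSketch {

    // Hypothetical stand-in for the project's SolrUtil.submitToSolr: flushes the
    // accumulated documents once the batch reaches batchSize and clears the list
    // so the caller can keep appending to it.
    public static void submitToSolr(SolrClient client,
                                    List<SolrInputDocument> docs,
                                    int batchSize,
                                    Optional<Object> importTime) throws Exception {
        if (!docs.isEmpty() && docs.size() >= batchSize) {
            client.add(docs);  // buffered update
            client.commit();   // make the batch searchable
            docs.clear();      // critical: otherwise the next call resubmits everything
        }
    }
}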
Also used : InfoDaoBaseInter(com.rainsoft.inter.InfoDaoBaseInter) java.util(java.util) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) VoidFunction(org.apache.spark.api.java.function.VoidFunction) StringUtils(org.apache.commons.lang3.StringUtils) StorageLevel(org.apache.spark.storage.StorageLevel) ClassPathXmlApplicationContext(org.springframework.context.support.ClassPathXmlApplicationContext) ContentDaoBaseInter(com.rainsoft.inter.ContentDaoBaseInter) BigDataConstants(com.rainsoft.BigDataConstants) JavaRDD(org.apache.spark.api.java.JavaRDD) ISecDaoBaseInter(com.rainsoft.inter.ISecDaoBaseInter) Logger(org.slf4j.Logger) RowFactory(org.apache.spark.sql.RowFactory) SparkConf(org.apache.spark.SparkConf) RowkeyColumnSecondarySort(com.rainsoft.hbase.RowkeyColumnSecondarySort) FileUtils(org.apache.commons.io.FileUtils) IOException(java.io.IOException) StopWatch(org.apache.commons.lang3.time.StopWatch) Row(org.apache.spark.sql.Row) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Collectors(java.util.stream.Collectors) File(java.io.File) SolrClient(org.apache.solr.client.solrj.SolrClient) com.rainsoft.utils(com.rainsoft.utils) AbstractApplicationContext(org.springframework.context.support.AbstractApplicationContext) ConfigurationManager(com.rainsoft.conf.ConfigurationManager) FieldConstants(com.rainsoft.FieldConstants) Function(org.apache.spark.api.java.function.Function) SolrInputDocument(org.apache.solr.common.SolrInputDocument)

Example 2 with VoidFunction

use of org.apache.spark.api.java.function.VoidFunction in project beijingThirdPeriod by weidongcao.

the class BcpImportHBaseSolrService method bcpWriteIntoSolr.

public static void bcpWriteIntoSolr(JavaRDD<String[]> javaRDD, TaskBean task) {
    logger.info("开始将 {} 的BCP数据索引到Solr", task.getContentType());
    /*
     * Write the data into Solr
     */
    javaRDD.foreachPartition((VoidFunction<Iterator<String[]>>) iterator -> {
        List<SolrInputDocument> list = new ArrayList<>();
        while (iterator.hasNext()) {
            String[] str = iterator.next();
            SolrInputDocument doc = new SolrInputDocument();
            // rowkey layout: "<captureTime>_<id>"
            String rowkey = str[0];
            doc.addField("ID", rowkey.split("_")[1]);
            doc.addField(BigDataConstants.SOLR_CONTENT_ID.toUpperCase(), rowkey);
            doc.addField(BigDataConstants.SOLR_DOC_TYPE_KEY, FieldConstants.DOC_TYPE_MAP.get(task.getContentType()));
            doc.addField(BigDataConstants.CAPTURE_TIME, rowkey.split("_")[0]);
            Date curDate = new Date();
            doc.addField("import_time".toUpperCase(), DateFormatUtils.DATE_TIME_FORMAT.format(curDate));
            doc.addField("import_time".toLowerCase(), curDate.getTime());
            for (int i = 1; i < str.length; i++) {
                String value = str[i];
                if (task.getColumns().length <= i - 1) {
                    break;
                }
                String key = task.getColumns()[i - 1].toUpperCase();
                if (StringUtils.isNotEmpty(value)) {
                    doc.addField(key, value);
                }
            }
            list.add(doc);
        }
        if (list.size() > 0) {
            // pick the default collection for the current date (project helper),
            // then add with commitWithin = 1000 ms
            SolrUtil.setCloudSolrClientDefaultCollection(client, new Date());
            client.add(list, 1000);
            logger.info("----> wrote {} documents to Solr", list.size());
        } else {
            logger.info("{}: this Spark partition contained no data", task.getContentType());
        }
    });
    logger.info("####### {}的BCP数据索引Solr完成 #######", task.getContentType());
}
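
The rowkey is split twice above, so the assumed layout is worth making explicit. A small self-contained helper, inferred from those split calls (the "<captureTime>_<id>" layout and the sample value are assumptions, not confirmed by the project):

public final class RowkeySketch {

    // assumed layout: "<captureTime>_<id>", inferred from the split calls above
    public static String captureTime(String rowkey) {
        return rowkey.split("_")[0]; // leading segment: capture timestamp
    }

    public static String id(String rowkey) {
        return rowkey.split("_")[1]; // trailing segment: record id
    }

    public static void main(String[] args) {
        String rowkey = "20180101120000_0001"; // made-up sample value
        System.out.println(captureTime(rowkey)); // 20180101120000
        System.out.println(id(rowkey));          // 0001
    }
}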
Also used : java.util(java.util) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) ArrayUtils(org.apache.commons.lang3.ArrayUtils) VoidFunction(org.apache.spark.api.java.function.VoidFunction) StringUtils(org.apache.commons.lang3.StringUtils) DateFormatUtils(com.rainsoft.utils.DateFormatUtils) StorageLevel(org.apache.spark.storage.StorageLevel) TaskBean(com.rainsoft.domain.TaskBean) ClassPathXmlApplicationContext(org.springframework.context.support.ClassPathXmlApplicationContext) BigDataConstants(com.rainsoft.BigDataConstants) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) FileUtils(com.rainsoft.utils.io.FileUtils) Logger(org.slf4j.Logger) SolrUtil(com.rainsoft.utils.SolrUtil) SparkConf(org.apache.spark.SparkConf) RowkeyColumnSecondarySort(com.rainsoft.hbase.RowkeyColumnSecondarySort) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SolrClient(org.apache.solr.client.solrj.SolrClient) HBaseUtils(com.rainsoft.utils.HBaseUtils) IOUtils(org.apache.commons.io.IOUtils) AbstractApplicationContext(org.springframework.context.support.AbstractApplicationContext) java.io(java.io) ConfigurationManager(com.rainsoft.conf.ConfigurationManager) FieldConstants(com.rainsoft.FieldConstants) Function(org.apache.spark.api.java.function.Function) SolrInputDocument(org.apache.solr.common.SolrInputDocument)

Example 3 with VoidFunction

use of org.apache.spark.api.java.function.VoidFunction in project beijingThirdPeriod by weidongcao.

the class TestSpark method main.

public static void main(String[] args) {
    String[] arr = FieldConstants.BCP_FILE_COLUMN_MAP.get("bcp_ftp");
    List<Integer[]> list = new ArrayList<>();
    Random rand = new Random();
    for (int i = 0; i < 9; i++) {
        Integer[] ints = new Integer[31];
        for (int j = 0; j < 31; j++) {
            ints[j] = rand.nextInt();
        }
        list.add(ints);
    }
    SparkSession spark = SparkSession.builder().appName(TestSpark.class.getSimpleName()).master("local").getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
    JavaRDD<Integer[]> dataRDD = jsc.parallelize(list);
    JavaPairRDD<String, Integer> pairRDD = dataRDD.flatMapToPair((PairFlatMapFunction<Integer[], String, Integer>) ints -> {
        List<Tuple2<String, Integer>> list1 = new ArrayList<>();
        for (int i = 0; i < ints.length; i++) {
            // assumes the bcp_ftp column list has at least ints.length (31) entries
            String key = arr[i];
            Integer value = ints[i];
            list1.add(new Tuple2<>(key, value));
        }
        return list1.iterator();
    });
    pairRDD.foreach((VoidFunction<Tuple2<String, Integer>>) tuple -> System.out.println(tuple.toString()));
    jsc.close();
}
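
The (PairFlatMapFunction<...>) and (VoidFunction<...>) casts pin each lambda's target type. Against the Spark 2.x signatures used here they are generally optional, but the pattern matters whenever overloads leave an implicitly typed lambda ambiguous. A minimal non-Spark illustration (VoidFn and OtherFn are made-up interfaces standing in for Spark's function types):

import java.io.Serializable;

public class LambdaTargetTypeSketch {

    @FunctionalInterface
    interface VoidFn<T> extends Serializable { void call(T t) throws Exception; }

    @FunctionalInterface
    interface OtherFn<T> extends Serializable { void call(T t); }

    static <T> void foreach(VoidFn<T> f) { /* stand-in for an overloaded API */ }

    static <T> void foreach(OtherFn<T> f) { /* second overload */ }

    public static void main(String[] args) {
        // foreach(s -> System.out.println(s));  // ambiguous: both overloads apply
        foreach((VoidFn<String>) s -> System.out.println(s)); // the cast resolves it
    }
}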
Also used : List(java.util.List) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) FieldConstants(com.rainsoft.FieldConstants) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Random(java.util.Random) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) ArrayList(java.util.ArrayList)

Example 4 with VoidFunction

use of org.apache.spark.api.java.function.VoidFunction in project beijingThirdPeriod by weidongcao.

the class TestHbase method main.

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("aaa").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    Random random = new Random();
    List<String> list = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
        int[] aaa = new int[4];
        for (int j = 0; j < 4; j++) {
            aaa[j] = random.nextInt(900) + 100; // random three-digit number
        }
        // ',' (char) selects StringUtils.join(int[], char); passing "," (String) would
        // fall through to the varargs join(T...) and stringify the array reference
        list.add(StringUtils.join(aaa, ','));
    }
    JavaRDD<String> originalRDD = sc.parallelize(list);
    originalRDD.foreach((VoidFunction<String>) s -> System.out.println(s));
    /*JavaPairRDD<RowkeyColumnSecondarySort, String> hfileRDD = originalRDD.flatMapToPair(
                new PairFlatMapFunction<String, RowkeyColumnSecondarySort, String>() {
                    @Override
                    public Iterable<Tuple2<RowkeyColumnSecondarySort, String>> call(String s) throws Exception {
                        String[] aa = s.split(",");

                    }
                }
        )*/
    sc.close();
}
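
The commented-out stub above targets the Spark 1.x API, where PairFlatMapFunction.call returned an Iterable; since Spark 2.0 it returns an Iterator. A self-contained completion under the 2.x API, using plain String keys because the RowkeyColumnSecondarySort constructor is project-internal (treating the first CSV field as the key is an assumption for illustration):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class SplitToPairsSketch {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SplitToPairsSketch").setMaster("local");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> originalRDD = sc.parallelize(Arrays.asList("111,222,333,444"));
            // Spark 2.x: call returns an Iterator, not the Iterable of the 1.x stub
            JavaPairRDD<String, String> pairs = originalRDD.flatMapToPair(
                    (PairFlatMapFunction<String, String, String>) s -> {
                        String[] aa = s.split(",");
                        List<Tuple2<String, String>> out = new ArrayList<>();
                        // emit one (first field, remaining field) pair per column
                        for (int i = 1; i < aa.length; i++) {
                            out.add(new Tuple2<>(aa[0], aa[i]));
                        }
                        return out.iterator();
                    });
            pairs.foreach((VoidFunction<Tuple2<String, String>>) t -> System.out.println(t));
        }
    }
}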
Also used : List(java.util.List) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Random(java.util.Random) VoidFunction(org.apache.spark.api.java.function.VoidFunction) StringUtils(org.apache.commons.lang3.StringUtils) JavaRDD(org.apache.spark.api.java.JavaRDD) ArrayList(java.util.ArrayList)

Example 5 with VoidFunction

use of org.apache.spark.api.java.function.VoidFunction in project net.jgp.labs.spark by jgperrin.

the class StreamingIngestionFileSystemTextFileToDataframeApp method start.

private void start() {
    // Create a local StreamingContext with two working threads and a batch
    // interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();
    // Turn each micro-batch of lines into a Dataset<Row> and show it
    msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {

        private static final long serialVersionUID = -590010339928376829L;

        @Override
        public void call(JavaRDD<String> rdd) {
            JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {

                private static final long serialVersionUID = 5167089361335095997L;

                @Override
                public Row call(String msg) {
                    Row row = RowFactory.create(msg);
                    return row;
                }
            });
            // Create Schema
            StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });
            // Get Spark 2.0 session
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
            msgDataFrame.show();
        }
    });
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // restore the interrupt flag instead of swallowing the exception
        Thread.currentThread().interrupt();
    }
}
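
The same pipeline reads more compactly with lambdas on Java 8+. A behavior-equivalent sketch of start() (JavaSparkSessionSingleton and StreamingUtils come from the surrounding project, as above):

private void start() {
    SparkConf conf = new SparkConf().setMaster("local[2]")
            .setAppName("Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();
    msgDataStream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
        // one Row per incoming line
        JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) RowFactory::create);
        StructType schema = DataTypes.createStructType(new StructField[] {
                DataTypes.createStructField("Message", DataTypes.StringType, true) });
        SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
        spark.createDataFrame(rowRDD, schema).show();
    });
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}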
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) Dataset(org.apache.spark.sql.Dataset) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Function(org.apache.spark.api.java.function.Function) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Aggregations

VoidFunction (org.apache.spark.api.java.function.VoidFunction): 9 uses
SparkConf (org.apache.spark.SparkConf): 8 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 8 uses
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 6 uses
Function (org.apache.spark.api.java.function.Function): 6 uses
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 5 uses
FieldConstants (com.rainsoft.FieldConstants): 4 uses
ArrayList (java.util.ArrayList): 4 uses
Tuple2 (scala.Tuple2): 4 uses
BigDataConstants (com.rainsoft.BigDataConstants): 3 uses
RowkeyColumnSecondarySort (com.rainsoft.hbase.RowkeyColumnSecondarySort): 3 uses
List (java.util.List): 3 uses
StringUtils (org.apache.commons.lang3.StringUtils): 3 uses
SolrClient (org.apache.solr.client.solrj.SolrClient): 3 uses
SolrInputDocument (org.apache.solr.common.SolrInputDocument): 3 uses
JavaStreamingContext (org.apache.spark.streaming.api.java.JavaStreamingContext): 3 uses
ConfigurationManager (com.rainsoft.conf.ConfigurationManager): 2 uses
TaskBean (com.rainsoft.domain.TaskBean): 2 uses
DateFormatUtils (com.rainsoft.utils.DateFormatUtils): 2 uses
HBaseUtils (com.rainsoft.utils.HBaseUtils): 2 uses