use of org.apache.spark.api.java.function.FlatMapFunction in project beijingThirdPeriod by weidongcao.
the class SparkExportToHBase method main.
public static void main(String[] args) throws Exception {
    // task type
    String taskType = args[0];
    // root directory on HDFS for temporary data storage
    String hdfsDataPath = args[1];
    SparkConf conf = new SparkConf().setAppName(SparkExportToHBase.class.getName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Oracle table name
    String tableName = NamingRuleUtils.getOracleContentTableName(taskType);
    // column family name
    String cf = NamingRuleUtils.getHBaseContentTableCF();
    // temporary HDFS directory for the HFiles
    String tempHDFSPath = NamingRuleUtils.getHFileTaskDir(NamingRuleUtils.getOracleContentTableName(taskType));
    InputStream in = SparkExportToHBase.class.getClassLoader().getResourceAsStream("metadata/" + tableName.toLowerCase());
    String[] fieldNames = IOUtils.toString(in, "utf-8").split("\r\n");
    JavaRDD<String> originalRDD = sc.textFile(hdfsDataPath);
    // split each line into its fields (note: fieldRDD is not used further in this method)
    JavaRDD<String[]> fieldRDD = originalRDD.mapPartitions((FlatMapFunction<Iterator<String>, String[]>) iter -> {
        List<String[]> list = new ArrayList<>();
        while (iter.hasNext()) {
            String str = iter.next();
            String[] fields = str.split("\t");
            list.add(fields);
        }
        return list.iterator();
    });
    /*
     * Convert the data into HBase's HFile format
     */
    JavaPairRDD<RowkeyColumnSecondarySort, String> hbasePairRDD = originalRDD.flatMapToPair((PairFlatMapFunction<String, RowkeyColumnSecondarySort, String>) (String line) -> {
        List<Tuple2<RowkeyColumnSecondarySort, String>> list = new ArrayList<>();
        String[] cols = line.split("\t");
        String rowkey = cols[0];
        for (int i = 1; i < cols.length; i++) {
            String value = cols[i];
            if ((null != value) && (!"".equals(value))) {
                list.add(new Tuple2<>(new RowkeyColumnSecondarySort(rowkey, fieldNames[i]), value));
            }
        }
        return list.iterator();
    }).sortByKey();
    /*
     * Spark writes the HFiles to HDFS and bulk-loads them into HBase
     */
    HBaseUtils.writeData2HBase(hbasePairRDD, "H_" + tableName, cf, tempHDFSPath);
    logger.info("Finished writing to HBase");
    sc.close();
}
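A minimal sketch of the secondary-sort key this pipeline relies on: the project's RowkeyColumnSecondarySort class is not shown here, but for sortByKey() to order cells the way an HBase HFile bulk load expects (by rowkey, then by column), the key presumably needs to compare on those two fields. The class below is a hypothetical stand-in illustrating that contract, not the project's implementation.
// Hypothetical stand-in for a composite key comparable by rowkey, then by column name,
// so that sortByKey() emits cells in the order an HFile bulk load expects.
public class RowkeyColumnKey implements Comparable<RowkeyColumnKey>, java.io.Serializable {

    private final String rowkey;
    private final String column;

    public RowkeyColumnKey(String rowkey, String column) {
        this.rowkey = rowkey;
        this.column = column;
    }

    @Override
    public int compareTo(RowkeyColumnKey other) {
        int byRow = this.rowkey.compareTo(other.rowkey);
        return byRow != 0 ? byRow : this.column.compareTo(other.column);
    }
}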
use of org.apache.spark.api.java.function.FlatMapFunction in project incubator-sdap-mudrod by apache.
the class CrawlerDetection method checkByRateInParallel.
void checkByRateInParallel() throws InterruptedException, IOException {
    JavaRDD<String> userRDD = getUserRDD(this.httpType);
    LOG.info("Original User count: {}", userRDD.count());
    int userCount = 0;
    userCount = userRDD.mapPartitions((FlatMapFunction<Iterator<String>, Integer>) iterator -> {
        // one ESDriver per partition, so the Elasticsearch client is not created per record
        ESDriver tmpES = new ESDriver(props);
        tmpES.createBulkProcessor();
        List<Integer> realUserNums = new ArrayList<>();
        while (iterator.hasNext()) {
            String s = iterator.next();
            Integer realUser = checkByRate(tmpES, s);
            realUserNums.add(realUser);
        }
        tmpES.destroyBulkProcessor();
        tmpES.close();
        return realUserNums.iterator();
    }).reduce((Function2<Integer, Integer, Integer>) (a, b) -> a + b);
    LOG.info("User count: {}", Integer.toString(userCount));
}
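A self-contained sketch of the same pattern with the project-specific pieces replaced by hypothetical stand-ins (Resource for ESDriver, check for checkByRate): mapPartitions lets the expensive client be created and torn down once per partition instead of once per record, and reduce then sums the per-record results.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public class MapPartitionsSetupSketch {

    // Hypothetical stand-in for a heavyweight client such as ESDriver.
    static class Resource {
        int check(String user) {
            return user.startsWith("u") ? 1 : 0;
        }
        void close() {
            // release connections here
        }
    }

    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[1]", "mapPartitionsSketch");
        JavaRDD<String> users = sc.parallelize(Arrays.asList("u1", "u2", "bot"));
        int realUsers = users.mapPartitions((FlatMapFunction<Iterator<String>, Integer>) it -> {
            Resource resource = new Resource();   // one client per partition
            List<Integer> counts = new ArrayList<>();
            while (it.hasNext()) {
                counts.add(resource.check(it.next()));
            }
            resource.close();
            return counts.iterator();
        }).reduce(Integer::sum);
        System.out.println("real users: " + realUsers);
        sc.close();
    }
}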
use of org.apache.spark.api.java.function.FlatMapFunction in project java_study by aloyschen.
the class RDD method create_RDD.
/*
 * Create an RDD from a list of sentences and exercise map/flatMap on it.
 */
public void create_RDD() {
    JavaSparkContext sc = getSc();
    sc.setLogLevel("ERROR");
    JavaRDD<String> sentences = sc.parallelize(Arrays.asList("I am learning", "and you"));
    JavaRDD<String> test2 = sentences.map(line -> {
        if (line.contains("error")) {
            System.out.println("this has error");
            System.out.println(line);
        }
        return line;
    });
    System.out.println(test2.count());
    JavaRDD<String> words = sentences.flatMap((FlatMapFunction<String, String>) line -> Arrays.asList(line.split(" ")).iterator());
    System.out.println(words.take(5));
    // test.foreach(System.out::println);
}
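As a follow-up sketch, the words RDD above pairs naturally with mapToPair/reduceByKey to produce the classic word count; everything used here is standard Spark Java API (JavaPairRDD, PairFunction, scala.Tuple2), assumed imported.
JavaPairRDD<String, Integer> wordCounts = words
        .mapToPair((PairFunction<String, String, Integer>) word -> new Tuple2<>(word, 1))
        .reduceByKey(Integer::sum);
// prints each word with its count, e.g. "and: 1"
wordCounts.collect().forEach(t -> System.out.println(t._1() + ": " + t._2()));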
use of org.apache.spark.api.java.function.FlatMapFunction in project calcite by apache.
the class SparkRules method main.
// Play area
public static void main(String[] args) {
    final JavaSparkContext sc = new JavaSparkContext("local[1]", "calcite");
    final JavaRDD<String> file = sc.textFile("/usr/share/dict/words");
    System.out.println(file.map(new Function<String, Object>() {
        @Override
        public Object call(String s) throws Exception {
            return s.substring(0, Math.min(s.length(), 1));
        }
    }).distinct().count());
    file.cache();
    String s = file.groupBy(new Function<String, String>() {
        @Override
        public String call(String s) throws Exception {
            return s.substring(0, Math.min(s.length(), 1));
        }
    }).map(new Function<Tuple2<String, Iterable<String>>, Object>() {
        @Override
        public Object call(Tuple2<String, Iterable<String>> pair) {
            return pair._1() + ":" + Iterables.size(pair._2());
        }
    }).collect().toString();
    System.out.print(s);
    final JavaRDD<Integer> rdd = sc.parallelize(new AbstractList<Integer>() {
        final Random random = new Random();

        @Override
        public Integer get(int index) {
            System.out.println("get(" + index + ")");
            return random.nextInt(100);
        }

        @Override
        public int size() {
            System.out.println("size");
            return 10;
        }
    });
    System.out.println(rdd.groupBy(new Function<Integer, Integer>() {
        public Integer call(Integer integer) {
            return integer % 2;
        }
    }).collect().toString());
    System.out.println(file.flatMap(new FlatMapFunction<String, Pair<String, Integer>>() {
        public Iterator<Pair<String, Integer>> call(String x) {
            if (!x.startsWith("a")) {
                return Collections.emptyIterator();
            }
            return Collections.singletonList(Pair.of(x.toUpperCase(Locale.ROOT), x.length())).iterator();
        }
    }).take(5).toString());
}
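For comparison, a sketch of the last flatMap rewritten with a Java 8 lambda; behavior is unchanged, the cast gives the lambda its FlatMapFunction target type, and Pair is the same Pair class used in the original snippet.
System.out.println(file.flatMap((FlatMapFunction<String, Pair<String, Integer>>) x -> {
    if (!x.startsWith("a")) {
        return Collections.emptyIterator();
    }
    return Collections.singletonList(Pair.of(x.toUpperCase(Locale.ROOT), x.length())).iterator();
}).take(5).toString());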
use of org.apache.spark.api.java.function.FlatMapFunction in project learning-spark by databricks.
the class BasicAvgMapPartitions method run.
public void run(String master) {
    JavaSparkContext sc = new JavaSparkContext(master, "basicavgmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
    FlatMapFunction<Iterator<Integer>, AvgCount> setup = new FlatMapFunction<Iterator<Integer>, AvgCount>() {
        @Override
        public Iterable<AvgCount> call(Iterator<Integer> input) {
            AvgCount a = new AvgCount(0, 0);
            while (input.hasNext()) {
                a.total_ += input.next();
                a.num_ += 1;
            }
            ArrayList<AvgCount> ret = new ArrayList<AvgCount>();
            ret.add(a);
            return ret;
        }
    };
    Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
        @Override
        public AvgCount call(AvgCount a, AvgCount b) {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
        }
    };
    AvgCount result = rdd.mapPartitions(setup).reduce(combine);
    System.out.println(result.avg());
}
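Note that this example targets the Spark 1.x Java API, where FlatMapFunction.call returns an Iterable. From Spark 2.0 onward call returns an Iterator, so under a newer Spark the setup function would look roughly like this sketch (AvgCount as above, java.util.Collections assumed imported):
FlatMapFunction<Iterator<Integer>, AvgCount> setup2 = input -> {
    // accumulate a single (sum, count) pair for the whole partition
    AvgCount a = new AvgCount(0, 0);
    while (input.hasNext()) {
        a.total_ += input.next();
        a.num_ += 1;
    }
    return Collections.singletonList(a).iterator();
};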