use of org.apache.spark.api.java.JavaPairRDD in project mm-dev by sbl-sdsc.
the class TestRosettaMmtf method main.
/**
 * Test: Read an MMTF-Hadoop sequence file.
 *
 * @param args args[0] <path-to-mmtf-hadoop-sequence-file>
 *
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    // instantiate Spark
    // TODO set to local[1] !!!!
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("TestSwissModelMmtf");
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();
    // read PDB files recursively starting at the specified directory
    JavaPairRDD<String, StructureDataInterface> structures = MmtfReader.readSequenceFile(args[0], sc);
    // total: 639 structures
    // structures = structures.filter(new ContainsDnaChain()); // ?
    // structures = structures.filter(new ContainsLProteinChain()); // 639?
    // structures = structures.filter(new ContainsGroup("ZN")); // 0
    // structures = structures.filter(new ContainsGroup("ATP")); //
    // debug: print structure data
    // structures.foreach(t -> TraverseStructureHierarchy.demo(t._2));
    // structures.foreach(t -> System.out.println(t._1));
    // print the total number of entities across all structures
    System.out.println(structures.map(t -> t._2.getNumEntities()).reduce((a, b) -> a + b));
    System.out.println("Number of structures read: " + structures.count());
    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");
    // close Spark
    sc.close();
}
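For reference, a minimal sketch (not part of the original test) of applying one of the commented-out mmtf-spark filters before counting; it assumes ContainsLProteinChain from edu.sdsc.mmtf.spark.filters is on the classpath:
// Keep only structures that contain at least one L-protein chain, then count them.
JavaPairRDD<String, StructureDataInterface> proteins = structures.filter(new ContainsLProteinChain());
System.out.println("Structures with L-protein chains: " + proteins.count());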
use of org.apache.spark.api.java.JavaPairRDD in project beijingThirdPeriod by weidongcao.
the class TestSpark method main.
public static void main(String[] args) {
    String[] arr = FieldConstants.BCP_FILE_COLUMN_MAP.get("bcp_ftp");
    // generate 9 rows of 31 random integers as test data
    List<Integer[]> list = new ArrayList<>();
    Random rand = new Random();
    for (int i = 0; i < 9; i++) {
        Integer[] ints = new Integer[31];
        for (int j = 0; j < 31; j++) {
            ints[j] = rand.nextInt();
        }
        list.add(ints);
    }
    SparkSession spark = SparkSession.builder().appName(TestSpark.class.getSimpleName()).master("local").getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
    JavaRDD<Integer[]> dataRDD = jsc.parallelize(list);
    // pair each value with the corresponding column name from arr
    JavaPairRDD<String, Integer> pairRDD = dataRDD.flatMapToPair((PairFlatMapFunction<Integer[], String, Integer>) ints -> {
        List<Tuple2<String, Integer>> list1 = new ArrayList<>();
        for (int i = 0; i < ints.length; i++) {
            String key = arr[i];
            Integer value = ints[i];
            list1.add(new Tuple2<>(key, value));
        }
        return list1.iterator();
    });
    pairRDD.foreach((VoidFunction<Tuple2<String, Integer>>) tuple -> System.out.println(tuple.toString()));
    jsc.close();
}
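A natural follow-up, not in the original snippet, is to aggregate the generated values per column name with JavaPairRDD.reduceByKey:
// Hypothetical follow-up: sum the random values for each column name and print the totals.
JavaPairRDD<String, Integer> sums = pairRDD.reduceByKey(Integer::sum);
sums.collect().forEach(t -> System.out.println(t._1 + " -> " + t._2));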
use of org.apache.spark.api.java.JavaPairRDD in project beijingThirdPeriod by weidongcao.
the class SparkExportToHBase method main.
public static void main(String[] args) throws Exception {
    // job type
    String taskType = args[0];
    // root directory for temporary data storage on HDFS
    String hdfsDataPath = args[1];
    SparkConf conf = new SparkConf().setAppName(SparkExportToHBase.class.getName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Oracle table name
    String tableName = NamingRuleUtils.getOracleContentTableName(taskType);
    // column family name
    String cf = NamingRuleUtils.getHBaseContentTableCF();
    // temporary HDFS directory for the HFiles
    String tempHDFSPath = NamingRuleUtils.getHFileTaskDir(NamingRuleUtils.getOracleContentTableName(taskType));
    InputStream in = SparkExportToHBase.class.getClassLoader().getResourceAsStream("metadata/" + tableName.toLowerCase());
    String[] fieldNames = IOUtils.toString(in, "utf-8").split("\r\n");
    JavaRDD<String> originalRDD = sc.textFile(hdfsDataPath);
    // split each input line into its tab-separated fields
    JavaRDD<String[]> fieldRDD = originalRDD.mapPartitions((FlatMapFunction<Iterator<String>, String[]>) iter -> {
        List<String[]> list = new ArrayList<>();
        while (iter.hasNext()) {
            String str = iter.next();
            String[] fields = str.split("\t");
            list.add(fields);
        }
        return list.iterator();
    });
    /*
     * convert the data to HBase's HFile format
     */
    JavaPairRDD<RowkeyColumnSecondarySort, String> hbasePairRDD = originalRDD.flatMapToPair((PairFlatMapFunction<String, RowkeyColumnSecondarySort, String>) (String line) -> {
        List<Tuple2<RowkeyColumnSecondarySort, String>> list = new ArrayList<>();
        String[] cols = line.split("\t");
        String rowkey = cols[0];
        for (int i = 1; i < cols.length; i++) {
            String value = cols[i];
            if ((null != value) && (!value.isEmpty())) {
                list.add(new Tuple2<>(new RowkeyColumnSecondarySort(rowkey, fieldNames[i]), value));
            }
        }
        return list.iterator();
    }).sortByKey();
    /*
     * write the HFiles to HDFS with Spark and bulk-load them into HBase
     */
    HBaseUtils.writeData2HBase(hbasePairRDD, "H_" + tableName, cf, tempHDFSPath);
    logger.info("Finished writing to HBase");
    sc.close();
}
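HBaseUtils.writeData2HBase is project-specific and not shown here; a rough sketch of the standard Spark bulk-load pattern such a helper typically wraps is below (the getRowkey()/getColumn() accessors on RowkeyColumnSecondarySort are assumptions, and the HBase classes come from the org.apache.hadoop.hbase packages):
// Assumed pattern: map each sorted (rowkey/column, value) pair to an HBase KeyValue,
// then write HFiles to the temporary HDFS path for a later bulk load into the "H_" + tableName table.
JavaPairRDD<ImmutableBytesWritable, KeyValue> hfileRDD = hbasePairRDD.mapToPair(t -> {
    byte[] row = Bytes.toBytes(t._1.getRowkey());       // assumed accessor
    byte[] qualifier = Bytes.toBytes(t._1.getColumn()); // assumed accessor
    KeyValue kv = new KeyValue(row, Bytes.toBytes(cf), qualifier, Bytes.toBytes(t._2));
    return new Tuple2<>(new ImmutableBytesWritable(row), kv);
});
hfileRDD.saveAsNewAPIHadoopFile(tempHDFSPath, ImmutableBytesWritable.class, KeyValue.class,
        HFileOutputFormat2.class, HBaseConfiguration.create());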
use of org.apache.spark.api.java.JavaPairRDD in project java_study by aloyschen.
the class GbdtAndLr method train.
/*
 * Combine the original features with the GBDT model's leaf indices and feed
 * the result into the LR model to train it.
 * @param Path: path to the training data
 */
public void train(String Path) {
    JavaSparkContext jsc = getSc();
    ArrayList<ArrayList<Integer>> treeLeafArray = new ArrayList<>();
    Dataset<Row> all_data = Preprocessing(jsc, Path);
    JavaRDD<LabeledPoint> gbdt_data_labelpoint = load_gbdt_data(all_data);
    GradientBoostedTreesModel gbdt = train_gbdt(jsc, gbdt_data_labelpoint);
    DecisionTreeModel[] decisionTreeModels = gbdt.trees();
    // collect the leaf-node ids of every GBDT tree
    for (int i = 0; i < this.maxIter; i++) {
        treeLeafArray.add(getLeafNodes(decisionTreeModels[i].topNode()));
        // System.out.println("leaf indices");
        // System.out.println(treeLeafArray.get(i));
    }
    JavaRDD<LabeledPoint> CombineFeatures = all_data.toJavaRDD().map(line -> {
        double[] newvaluesDouble;
        double[] features = new double[24];
        // put the value of each numerical column of the dataset into a DenseVector
        for (int i = 6; i < 18; i++) {
            org.apache.spark.mllib.linalg.DenseVector den = null;
            if (line.get(i) instanceof org.apache.spark.ml.linalg.Vector) {
                den = (DenseVector) Vectors.fromML((org.apache.spark.ml.linalg.DenseVector) line.get(i));
                features[i - 6] = den.toArray()[0];
            } else {
                features[i - 6] = Double.parseDouble(line.get(i).toString());
            }
        }
        DenseVector numerical_vector = new DenseVector(features);
        ArrayList<Double> newvaluesArray = new ArrayList<>();
        for (int i = 0; i < this.maxIter; i++) {
            int treePredict = predictModify(decisionTreeModels[i].topNode(), numerical_vector);
            int len = treeLeafArray.get(i).size();
            ArrayList<Double> treeArray = new ArrayList<>(len);
            // initialize all positions to 0, then set the position of the leaf the sample falls into to 1
            for (int j = 0; j < len; j++) treeArray.add(j, 0d);
            treeArray.set(treeLeafArray.get(i).indexOf(treePredict), 1d);
            newvaluesArray.addAll(treeArray);
        }
        // append the one-hot encoded categorical columns
        for (int i = 18; i < 29; i++) {
            SparseVector onehot_data = (SparseVector) Vectors.fromML((org.apache.spark.ml.linalg.SparseVector) line.get(i));
            DenseVector cat_data = onehot_data.toDense();
            for (int j = 0; j < cat_data.size(); j++) {
                newvaluesArray.add(cat_data.apply(j));
            }
        }
        newvaluesDouble = newvaluesArray.stream().mapToDouble(Double::doubleValue).toArray();
        DenseVector newdenseVector = new DenseVector(newvaluesDouble);
        return (new LabeledPoint(Double.valueOf(line.get(1).toString()), newdenseVector));
    });
    JavaRDD<LabeledPoint>[] splitsLR = CombineFeatures.randomSplit(new double[] { 0.7, 0.3 });
    JavaRDD<LabeledPoint> trainingDataLR = splitsLR[0];
    JavaRDD<LabeledPoint> testDataLR = splitsLR[1];
    System.out.println("Start train LR");
    LogisticRegressionModel LR = new LogisticRegressionWithLBFGS().setNumClasses(2).run(trainingDataLR.rdd()).clearThreshold();
    System.out.println("modelLR.weights().size():" + LR.weights().size());
    JavaPairRDD<Object, Object> test_LR = testDataLR.mapToPair((PairFunction<LabeledPoint, Object, Object>) labeledPoint -> {
        Tuple2<Object, Object> tuple2 = new Tuple2<>(LR.predict(labeledPoint.features()), labeledPoint.label());
        return tuple2;
    });
    BinaryClassificationMetrics test_metrics = new BinaryClassificationMetrics(test_LR.rdd());
    double test_auc = test_metrics.areaUnderROC();
    System.out.println("test data auc_score:" + test_auc);
    JavaPairRDD<Object, Object> train_LR = trainingDataLR.mapToPair((PairFunction<LabeledPoint, Object, Object>) labeledPoint -> {
        Tuple2<Object, Object> tuple2 = new Tuple2<>(LR.predict(labeledPoint.features()), labeledPoint.label());
        return tuple2;
    });
    BinaryClassificationMetrics train_metrics = new BinaryClassificationMetrics(train_LR.rdd());
    double train_auc = train_metrics.areaUnderROC();
    System.out.println("train data auc_score:" + train_auc);
    // sort precision by threshold in descending order and print the top ten entries
    JavaRDD<Tuple2<Object, Object>> precision = train_metrics.precisionByThreshold().toJavaRDD();
    JavaPairRDD<Object, Object> temp = JavaPairRDD.fromJavaRDD(precision);
    JavaPairRDD<Object, Object> swap = temp.mapToPair(Tuple2::swap);
    JavaPairRDD<Object, Object> precision_sort = swap.sortByKey(false);
    System.out.println("Precision by threshold: (Precision, Threshold)");
    for (Object entry : precision_sort.take(10).toArray()) {
        System.out.println(entry);
    }
}
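The getLeafNodes and predictModify helpers referenced above are not part of this snippet; a minimal sketch of what they typically look like for MLlib DecisionTreeModel nodes (org.apache.spark.mllib.tree.model.Node and Split), assuming continuous splits only, is:
// Collect the ids of all leaf nodes reachable from the given node.
private static ArrayList<Integer> getLeafNodes(Node node) {
    ArrayList<Integer> leaves = new ArrayList<>();
    if (node.isLeaf()) {
        leaves.add(node.id());
    } else {
        leaves.addAll(getLeafNodes(node.leftNode().get()));
        leaves.addAll(getLeafNodes(node.rightNode().get()));
    }
    return leaves;
}
// Walk one tree for a single sample and return the id of the leaf it lands in
// (continuous splits assumed: go left when the feature value is <= the threshold).
private static int predictModify(Node node, DenseVector features) {
    if (node.isLeaf()) {
        return node.id();
    }
    Split split = node.split().get();
    if (features.apply(split.feature()) <= split.threshold()) {
        return predictModify(node.leftNode().get(), features);
    } else {
        return predictModify(node.rightNode().get(), features);
    }
}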
use of org.apache.spark.api.java.JavaPairRDD in project mmtf-spark by sbl-sdsc.
the class MapReduceExample method main.
/**
* Counts the number of atoms in the PDB using the classic
* map-reduce algorithm
*
* @throws FileNotFoundException
*/
public static void main(String[] args) throws FileNotFoundException {
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(MapReduceExample.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read PDB from MMTF-Hadoop sequence file
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readFullSequenceFile(sc);
    // count number of atoms
    long numAtoms = pdb.map(t -> t._2.getNumAtoms()).reduce((a, b) -> a + b);
    System.out.println("Total number of atoms in PDB: " + numAtoms);
    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");
    sc.close();
}
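A small variation, not in the original example, that reuses the same RDD to report the average atom count per structure:
// Hypothetical follow-up: average number of atoms per structure.
long numStructures = pdb.count();
System.out.println("Average atoms per structure: " + (double) numAtoms / numStructures);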