Search in sources :

Example 36 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project mm-dev by sbl-sdsc.

the class TestRosettaMmtf method main.

/**
 * Test: Read MMTF-Hadoop Sequence file.
 *
 * @param args args[0] <path-to-mmtf-hadoop-sequence-file>
 *
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    // instantiate Spark
    // TODO set to local[1] !!!!
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("TestRosettaMmtf");
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();
    // read structures from the MMTF-Hadoop sequence file at the specified path
    JavaPairRDD<String, StructureDataInterface> structures = MmtfReader.readSequenceFile(args[0], sc);
    // total:  639 structures
    // structures = structures.filter(new ContainsDnaChain()); //  ?
    // structures = structures.filter(new ContainsLProteinChain()); // 639?
    // structures = structures.filter(new ContainsGroup("ZN")); // 0
    // structures = structures.filter(new ContainsGroup("ATP")); //
    // debug: print structure data
    // structures.foreach(t -> TraverseStructureHierarchy.demo(t._2));
    // structures.foreach(t -> System.out.println(t._1));
    // sum the number of entities across all structures
    System.out.println(structures.map(t -> t._2.getNumEntities()).reduce((a, b) -> a + b));
    System.out.println("Number of structures read: " + structures.count());
    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");
    // close Spark
    sc.close();
}
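
The commented-out filter lines above show where chain- and ligand-based filtering plugs in. A minimal sketch of re-enabling one of them, using only the ContainsLProteinChain filter this class already imports (the counts in the comments are the author's expectations, not verified here):

    // keep only structures containing at least one L-protein chain
    structures = structures.filter(new ContainsLProteinChain());
    System.out.println("Structures with L-protein chains: " + structures.count());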
Also used : MmtfImporter(edu.sdsc.mmtf.spark.io.MmtfImporter) Arrays(java.util.Arrays) ContainsLProteinChain(edu.sdsc.mmtf.spark.filters.ContainsLProteinChain) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) MmtfWriter(edu.sdsc.mmtf.spark.io.MmtfWriter) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) FileNotFoundException(java.io.FileNotFoundException) ContainsGroup(edu.sdsc.mmtf.spark.filters.ContainsGroup) TraverseStructureHierarchy(edu.sdsc.mmtf.spark.io.demos.TraverseStructureHierarchy) List(java.util.List) ContainsDnaChain(edu.sdsc.mmtf.spark.filters.ContainsDnaChain) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) ContainsDProteinChain(edu.sdsc.mmtf.spark.filters.ContainsDProteinChain) MmtfReader(edu.sdsc.mmtf.spark.io.MmtfReader)

Example 37 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project beijingThirdPeriod by weidongcao.

the class TestSpark method main.

public static void main(String[] args) {
    // column names for the bcp_ftp table
    String[] arr = FieldConstants.BCP_FILE_COLUMN_MAP.get("bcp_ftp");
    // generate 9 rows of 31 random integers as test data
    List<Integer[]> list = new ArrayList<>();
    Random rand = new Random();
    for (int i = 0; i < 9; i++) {
        Integer[] ints = new Integer[31];
        for (int j = 0; j < 31; j++) {
            ints[j] = rand.nextInt();
        }
        list.add(ints);
    }
    SparkSession spark = SparkSession.builder().appName(TestSpark.class.getSimpleName()).master("local").getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
    JavaRDD<Integer[]> dataRDD = jsc.parallelize(list);
    // pair each value with its column name: one (columnName, value) tuple per field
    JavaPairRDD<String, Integer> pairRDD = dataRDD.flatMapToPair((PairFlatMapFunction<Integer[], String, Integer>) ints -> {
        List<Tuple2<String, Integer>> list1 = new ArrayList<>();
        for (int i = 0; i < ints.length; i++) {
            String key = arr[i];
            Integer value = ints[i];
            list1.add(new Tuple2<>(key, value));
        }
        return list1.iterator();
    });
    pairRDD.foreach((VoidFunction<Tuple2<String, Integer>>) tuple -> System.out.println(tuple.toString()));
    jsc.close();
}
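
Because pairRDD keys every value by its column name, a per-column aggregation is the natural follow-up. A minimal sketch using the standard JavaPairRDD.reduceByKey API (the sum-per-column semantics are an illustration, not part of the original test):

    // sum the generated values per column name
    JavaPairRDD<String, Integer> sums = pairRDD.reduceByKey((a, b) -> a + b);
    sums.foreach((VoidFunction<Tuple2<String, Integer>>) t -> System.out.println(t._1 + " -> " + t._2));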
Also used : List(java.util.List) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) FieldConstants(com.rainsoft.FieldConstants) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Random(java.util.Random) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) ArrayList(java.util.ArrayList)

Example 38 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project beijingThirdPeriod by weidongcao.

the class SparkExportToHBase method main.

public static void main(String[] args) throws Exception {
    // job type
    String taskType = args[0];
    // root directory for temporary data storage on HDFS
    String hdfsDataPath = args[1];
    SparkConf conf = new SparkConf().setAppName(SparkExportToHBase.class.getName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Oracle table name
    String tableName = NamingRuleUtils.getOracleContentTableName(taskType);
    // HBase column family name
    String cf = NamingRuleUtils.getHBaseContentTableCF();
    // temporary HDFS directory for the generated HFiles
    String tempHDFSPath = NamingRuleUtils.getHFileTaskDir(NamingRuleUtils.getOracleContentTableName(taskType));
    InputStream in = SparkExportToHBase.class.getClassLoader().getResourceAsStream("metadata/" + tableName.toLowerCase());
    String[] fieldNames = IOUtils.toString(in, "utf-8").split("\r\n");
    JavaRDD<String> originalRDD = sc.textFile(hdfsDataPath);
    // note: fieldRDD is built here but never used below; the flatMapToPair re-parses originalRDD instead
    JavaRDD<String[]> fieldRDD = originalRDD.mapPartitions((FlatMapFunction<Iterator<String>, String[]>) iter -> {
        List<String[]> list = new ArrayList<>();
        while (iter.hasNext()) {
            String str = iter.next();
            String[] fields = str.split("\t");
            list.add(fields);
        }
        return list.iterator();
    });
    /*
     * Convert the data to HBase HFile format
     */
    JavaPairRDD<RowkeyColumnSecondarySort, String> hbasePairRDD = originalRDD.flatMapToPair((PairFlatMapFunction<String, RowkeyColumnSecondarySort, String>) (String line) -> {
        List<Tuple2<RowkeyColumnSecondarySort, String>> list = new ArrayList<>();
        String[] cols = line.split("\t");
        String rowkey = cols[0];
        for (int i = 1; i < cols.length; i++) {
            String value = cols[i];
            if ((null != value) && (!"".equals(value))) {
                list.add(new Tuple2<>(new RowkeyColumnSecondarySort(rowkey, fieldNames[i]), value));
            }
        }
        return list.iterator();
    }).sortByKey();
    /*
     * Write the HFiles to HDFS with Spark and bulk-load them into HBase
     */
    HBaseUtils.writeData2HBase(hbasePairRDD, "H_" + tableName, cf, tempHDFSPath);
    logger.info("HBase write complete");
    sc.close();
}
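
RowkeyColumnSecondarySort comes from the project itself and is not shown here. For sortByKey to order the output by rowkey and then column, as HFile bulk loads require, the key class must be Comparable and Serializable; the following is a hypothetical minimal sketch of such a composite key (field names and ordering are assumptions, not the project's actual implementation):

    import java.io.Serializable;

    // hypothetical composite key: sort by rowkey first, then by column name
    public class RowkeyColumnSecondarySort implements Comparable<RowkeyColumnSecondarySort>, Serializable {
        private final String rowkey;
        private final String column;

        public RowkeyColumnSecondarySort(String rowkey, String column) {
            this.rowkey = rowkey;
            this.column = column;
        }

        @Override
        public int compareTo(RowkeyColumnSecondarySort other) {
            int cmp = rowkey.compareTo(other.rowkey);
            return cmp != 0 ? cmp : column.compareTo(other.column);
        }
    }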
Also used : Logger(org.slf4j.Logger) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) Iterator(java.util.Iterator) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) ArrayList(java.util.ArrayList) HBaseUtils(com.rainsoft.utils.HBaseUtils) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) NamingRuleUtils(com.rainsoft.utils.NamingRuleUtils) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) InputStream(java.io.InputStream)

Example 39 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project java_study by aloyschen.

the class GbdtAndLr method train.

/*
 * Combine the per-tree leaf-index features produced by the GBDT model,
 * feed them into the LR model, and train the LR model.
 * @param Path: path to the training data
 */
public void train(String Path) {
    JavaSparkContext jsc = getSc();
    ArrayList<ArrayList<Integer>> treeLeafArray = new ArrayList<>();
    Dataset<Row> all_data = Preprocessing(jsc, Path);
    JavaRDD<LabeledPoint> gbdt_data_labelpoint = load_gbdt_data(all_data);
    GradientBoostedTreesModel gbdt = train_gbdt(jsc, gbdt_data_labelpoint);
    DecisionTreeModel[] decisionTreeModels = gbdt.trees();
    // collect the leaf-node indices of each tree in the GBDT model
    for (int i = 0; i < this.maxIter; i++) {
        treeLeafArray.add(getLeafNodes(decisionTreeModels[i].topNode()));
        // System.out.println("leaf indices");
        // System.out.println(treeLeafArray.get(i));
    }
    JavaRDD<LabeledPoint> CombineFeatures = all_data.toJavaRDD().map(line -> {
        double[] newvaluesDouble;
        double[] features = new double[24];
        // copy each numeric column of the dataset into the feature array
        for (int i = 6; i < 18; i++) {
            org.apache.spark.mllib.linalg.DenseVector den = null;
            if (line.get(i) instanceof org.apache.spark.ml.linalg.Vector) {
                den = (DenseVector) Vectors.fromML((org.apache.spark.ml.linalg.DenseVector) line.get(i));
                features[i - 6] = den.toArray()[0];
            } else {
                features[i - 6] = Double.parseDouble(line.get(i).toString());
            }
        }
        DenseVector numerical_vector = new DenseVector(features);
        ArrayList<Double> newvaluesArray = new ArrayList<>();
        for (int i = 0; i < this.maxIter; i++) {
            int treePredict = predictModify(decisionTreeModels[i].topNode(), numerical_vector);
            int len = treeLeafArray.get(i).size();
            ArrayList<Double> treeArray = new ArrayList<>(len);
            // initialize all positions to 0, then set the leaf node this sample falls into to 1
            for (int j = 0; j < len; j++) treeArray.add(j, 0d);
            treeArray.set(treeLeafArray.get(i).indexOf(treePredict), 1d);
            newvaluesArray.addAll(treeArray);
        }
        for (int i = 18; i < 29; i++) {
            SparseVector onehot_data = (SparseVector) Vectors.fromML((org.apache.spark.ml.linalg.SparseVector) line.get(i));
            DenseVector cat_data = onehot_data.toDense();
            for (int j = 0; j < cat_data.size(); j++) {
                newvaluesArray.add(cat_data.apply(j));
            }
        }
        newvaluesDouble = newvaluesArray.stream().mapToDouble(Double::doubleValue).toArray();
        DenseVector newdenseVector = new DenseVector(newvaluesDouble);
        return (new LabeledPoint(Double.valueOf(line.get(1).toString()), newdenseVector));
    });
    JavaRDD<LabeledPoint>[] splitsLR = CombineFeatures.randomSplit(new double[] { 0.7, 0.3 });
    JavaRDD<LabeledPoint> trainingDataLR = splitsLR[0];
    JavaRDD<LabeledPoint> testDataLR = splitsLR[1];
    System.out.println("Start train LR");
    LogisticRegressionModel LR = new LogisticRegressionWithLBFGS().setNumClasses(2).run(trainingDataLR.rdd()).clearThreshold();
    System.out.println("modelLR.weights().size():" + LR.weights().size());
    JavaPairRDD<Object, Object> test_LR = testDataLR.mapToPair((PairFunction<LabeledPoint, Object, Object>) labeledPoint -> {
        Tuple2<Object, Object> tuple2 = new Tuple2<>(LR.predict(labeledPoint.features()), labeledPoint.label());
        return tuple2;
    });
    BinaryClassificationMetrics test_metrics = new BinaryClassificationMetrics(test_LR.rdd());
    double test_auc = test_metrics.areaUnderROC();
    System.out.println("test data auc_score:" + test_auc);
    JavaPairRDD<Object, Object> train_LR = trainingDataLR.mapToPair((PairFunction<LabeledPoint, Object, Object>) labeledPoint -> {
        Tuple2<Object, Object> tuple2 = new Tuple2<>(LR.predict(labeledPoint.features()), labeledPoint.label());
        return tuple2;
    });
    BinaryClassificationMetrics train_metrics = new BinaryClassificationMetrics(train_LR.rdd());
    double train_auc = train_metrics.areaUnderROC();
    System.out.println("train data auc_score:" + train_auc);
    // sort precision over all thresholds and print the top ten
    JavaRDD<Tuple2<Object, Object>> precision = train_metrics.precisionByThreshold().toJavaRDD();
    JavaPairRDD<Object, Object> temp = JavaPairRDD.fromJavaRDD(precision);
    JavaPairRDD<Object, Object> swap = temp.mapToPair(Tuple2::swap);
    JavaPairRDD<Object, Object> precision_sort = swap.sortByKey(false);
    System.out.println("Precision by threshold: (Precision, Threshold)");
    Object[] topPrecision = precision_sort.take(10).toArray();
    for (int i = 0; i < 10; i++) {
        System.out.println(topPrecision[i]);
    }
}
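
The helpers getLeafNodes and predictModify are defined elsewhere in this class. In the usual GBDT-plus-LR pattern they walk each tree's node hierarchy via the MLlib Node/Split API already imported here; a sketch of what they typically look like, assuming continuous splits only (categorical splits would need an extra FeatureType check), not necessarily the project's exact implementation:

    // collect the ids of all leaf nodes under the given node
    private static ArrayList<Integer> getLeafNodes(Node node) {
        ArrayList<Integer> leaves = new ArrayList<>();
        if (node.isLeaf()) {
            leaves.add(node.id());
        } else {
            leaves.addAll(getLeafNodes(node.leftNode().get()));
            leaves.addAll(getLeafNodes(node.rightNode().get()));
        }
        return leaves;
    }

    // return the id of the leaf a sample falls into (continuous splits only)
    private static int predictModify(Node node, DenseVector features) {
        if (node.isLeaf()) {
            return node.id();
        }
        Split split = node.split().get();
        if (features.apply(split.feature()) <= split.threshold()) {
            return predictModify(node.leftNode().get(), features);
        }
        return predictModify(node.rightNode().get(), features);
    }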
Also used : Vectors(org.apache.spark.mllib.linalg.Vectors) java.util(java.util) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) Serializable(scala.Serializable) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) BinaryClassificationMetrics(org.apache.spark.mllib.evaluation.BinaryClassificationMetrics) CmdlineParser(de.tototec.cmdoption.CmdlineParser) SparseVector(org.apache.spark.mllib.linalg.SparseVector) LogisticRegressionModel(org.apache.spark.mllib.classification.LogisticRegressionModel) utils(utils) JavaRDD(org.apache.spark.api.java.JavaRDD) FeatureType(org.apache.spark.mllib.tree.configuration.FeatureType) DateFormat(java.text.DateFormat) DataTypes(org.apache.spark.sql.types.DataTypes) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) DenseVector(org.apache.spark.mllib.linalg.DenseVector) GradientBoostedTrees(org.apache.spark.mllib.tree.GradientBoostedTrees) SparkConf(org.apache.spark.SparkConf) LogisticRegressionWithLBFGS(org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) Option(scala.Option) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) org.apache.spark.ml.feature(org.apache.spark.ml.feature) org.apache.spark.mllib.tree.model(org.apache.spark.mllib.tree.model) org.apache.spark.sql(org.apache.spark.sql) JavaConverters(scala.collection.JavaConverters) BoostingStrategy(org.apache.spark.mllib.tree.configuration.BoostingStrategy) PairFunction(org.apache.spark.api.java.function.PairFunction)

Example 40 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project mmtf-spark by sbl-sdsc.

the class MapReduceExample method main.

/**
 * Counts the number of atoms in the PDB using the classic
 * map-reduce algorithm.
 *
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(MapReduceExample.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read PDB from MMTF-Hadoop sequence file
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readFullSequenceFile(sc);
    // count number of atoms
    long numAtoms = pdb.map(t -> t._2.getNumAtoms()).reduce((a, b) -> a + b);
    System.out.println("Total number of atoms in PDB: " + numAtoms);
    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");
    sc.close();
}
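
The same count can be written without an explicit reduce; a minimal sketch of an equivalent variation using the standard values()/mapToDouble API (shown for comparison only):

    // drop the keys, then sum the atom counts via a double-typed RDD
    double totalAtoms = pdb.values().mapToDouble(t -> t.getNumAtoms()).sum();
    System.out.println("Total number of atoms in PDB: " + (long) totalAtoms);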
Also used : StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) MmtfReader(edu.sdsc.mmtf.spark.io.MmtfReader) FileNotFoundException(java.io.FileNotFoundException)

Aggregations

JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 99 uses
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 44 uses
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 42 uses
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 42 uses
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 41 uses
Tuple2 (scala.Tuple2): 35 uses
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 33 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 28 uses
List (java.util.List): 27 uses
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 24 uses
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 23 uses
Collectors (java.util.stream.Collectors): 22 uses
IOException (java.io.IOException): 17 uses
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 16 uses
LongWritable (org.apache.hadoop.io.LongWritable): 15 uses
Broadcast (org.apache.spark.broadcast.Broadcast): 15 uses
Text (org.apache.hadoop.io.Text): 12 uses
UserException (org.broadinstitute.hellbender.exceptions.UserException): 12 uses
Function (org.apache.spark.api.java.function.Function): 11 uses
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 11 uses