Use of org.apache.spark.sql.Dataset in project pancm_project by xuwujing.
The class sparkSqlTest, method main.
public static void main(String[] args) throws Exception {
System.out.println("开始...");
// System.setProperty("hadoop.home.dir", "E:\\hadoop");
// System.setProperty("HADOOP_USER_NAME", "root");
// System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
SparkSession spark = SparkSession.builder().appName("lcc_java_read_hbase_register_to_table").master("local[*]").getOrCreate();
JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.property.clientPort", "2181");
conf.set("hbase.zookeeper.quorum", "192.169.0.25");
Scan scan = new Scan();
String tableName = "t_student";
conf.set(TableInputFormat.INPUT_TABLE, tableName);
org.apache.hadoop.hbase.protobuf.generated.ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
String scanToString = Base64.encodeBytes(proto.toByteArray());
conf.set(TableInputFormat.SCAN, scanToString);
JavaPairRDD<ImmutableBytesWritable, Result> myRDD = context.newAPIHadoopRDD(conf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
JavaRDD<Row> personsRDD = myRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, Row>() {
@Override
public Row call(Tuple2<ImmutableBytesWritable, Result> tuple) throws Exception {
System.out.println("====tuple==========" + tuple);
Result result = tuple._2();
String rowkey = Bytes.toString(result.getRow());
String name = Bytes.toString(result.getValue(Bytes.toBytes("lcc_liezu"), Bytes.toBytes("name")));
String sex = Bytes.toString(result.getValue(Bytes.toBytes("lcc_liezu"), Bytes.toBytes("sex")));
String age = Bytes.toString(result.getValue(Bytes.toBytes("lcc_liezu"), Bytes.toBytes("age")));
// the result can be converted directly into a Row here
return RowFactory.create(rowkey, name, sex, age);
}
});
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("id", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("sex", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("age", DataTypes.StringType, true));
StructType schema = DataTypes.createStructType(structFields);
Dataset<Row> stuDf = spark.createDataFrame(personsRDD, schema);
// stuDf.select("id","name","age").write().mode(SaveMode.Append).parquet("par");
stuDf.printSchema();
stuDf.createOrReplaceTempView("Person");
Dataset<Row> nameDf = spark.sql("select * from Person ");
nameDf.show();
}
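The snippet above is shown without its import block. As a rough guide, it needs approximately the following imports; the HBase packages assume the pre-2.0 HBase client, which still provides ProtobufUtil and Base64 under these paths:

// Imports assumed for the example above (HBase 1.x client; a sketch, not taken from the project)
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;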
Use of org.apache.spark.sql.Dataset in project mm-dev by sbl-sdsc.
The class StructureAligner, method getAllVsAllAlignments.
/**
* Calculates all vs. all structural alignments of protein chains using the
* specified alignment algorithm. The input structures must contain single
* protein chains.
*
* @param targets structures containing single protein chains
* @param alignmentAlgorithm name of the algorithm
* @return dataset with alignment metrics
*/
public static Dataset<Row> getAllVsAllAlignments(JavaPairRDD<String, StructureDataInterface> targets, String alignmentAlgorithm) {
SparkSession session = SparkSession.builder().getOrCreate();
JavaSparkContext sc = new JavaSparkContext(session.sparkContext());
// create a list of (chainName, C-alpha coordinates) pairs
List<Tuple2<String, Point3d[]>> chains = targets.mapValues(s -> new ColumnarStructureX(s, true).getcAlphaCoordinates()).collect();
// create an RDD of all pair indices (0,1), (0,2), ..., (1,2), (1,3), ...
JavaRDD<Tuple2<Integer, Integer>> pairs = getPairs(sc, chains.size());
// calculate structural alignments for all pairs.
// broadcast (copy) chains to all worker nodes for efficient processing.
// for each pair there can be zero or more solutions, therefore we flatmap the pairs.
JavaRDD<Row> rows = pairs.flatMap(new StructuralAlignmentMapper(sc.broadcast(chains), alignmentAlgorithm));
// convert rows to a dataset
return session.createDataFrame(rows, getSchema());
}
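A minimal driver sketch for this method, assuming the MmtfReader and StructureToPolymerChains helpers from mmtf-spark to produce the single-chain inputs; the input path and the alignment algorithm string below are placeholders, since the accepted algorithm names depend on StructuralAlignmentMapper:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.rcsb.mmtf.api.StructureDataInterface;
import edu.sdsc.mmtf.spark.io.MmtfReader;
import edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains;
// plus an import for StructureAligner from mm-dev (package omitted here)

public class AllVsAllAlignmentDemo {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("AllVsAllAlignmentDemo").master("local[*]").getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        // read full structures and split them into single polymer chains
        JavaPairRDD<String, StructureDataInterface> chains = MmtfReader
                .readSequenceFile("/path/to/mmtf/full", sc)   // placeholder path
                .flatMapToPair(new StructureToPolymerChains());

        // placeholder algorithm name; use whatever StructuralAlignmentMapper supports
        String alignmentAlgorithm = "exhaustive";

        Dataset<Row> alignments = StructureAligner.getAllVsAllAlignments(chains, alignmentAlgorithm);
        alignments.show();

        spark.stop();
    }
}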
Use of org.apache.spark.sql.Dataset in project mmtf-spark by sbl-sdsc.
The class SparkMultiClassClassifier, method fit.
/**
* Fits the classifier. The dataset must contain at least the following two columns:
* label: the class labels
* features: the feature vector
* @param data the input dataset
* @return map with metrics
*/
public Map<String, String> fit(Dataset<Row> data) {
int classCount = (int) data.select(label).distinct().count();
StringIndexerModel labelIndexer = new StringIndexer().setInputCol(label).setOutputCol("indexedLabel").fit(data);
// Split the data into training and test sets (testFraction held out for testing)
Dataset<Row>[] splits = data.randomSplit(new double[] { 1.0 - testFraction, testFraction }, seed);
Dataset<Row> trainingData = splits[0];
Dataset<Row> testData = splits[1];
String[] labels = labelIndexer.labels();
System.out.println();
System.out.println("Class\tTrain\tTest");
for (String l : labels) {
System.out.println(l + "\t" + trainingData.select(label).filter(label + " = '" + l + "'").count() + "\t" + testData.select(label).filter(label + " = '" + l + "'").count());
}
// Set input columns
predictor.setLabelCol("indexedLabel").setFeaturesCol("features");
// Convert indexed labels back to original labels.
IndexToString labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels());
// Chain the indexer, predictor, and label converter in a Pipeline
Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { labelIndexer, predictor, labelConverter });
// Train model. This also runs the indexers.
PipelineModel model = pipeline.fit(trainingData);
// Make predictions.
Dataset<Row> predictions = model.transform(testData).cache();
// Display some sample predictions
System.out.println();
System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());
predictions.sample(false, 0.1, seed).show(25);
predictions = predictions.withColumnRenamed(label, "stringLabel");
predictions = predictions.withColumnRenamed("indexedLabel", label);
// collect metrics
Dataset<Row> pred = predictions.select("prediction", label);
Map<String, String> metrics = new LinkedHashMap<>();
metrics.put("Method", predictor.getClass().getSimpleName());
if (classCount == 2) {
BinaryClassificationMetrics b = new BinaryClassificationMetrics(pred);
metrics.put("AUC", Float.toString((float) b.areaUnderROC()));
}
MulticlassMetrics m = new MulticlassMetrics(pred);
metrics.put("F", Float.toString((float) m.weightedFMeasure()));
metrics.put("Accuracy", Float.toString((float) m.accuracy()));
metrics.put("Precision", Float.toString((float) m.weightedPrecision()));
metrics.put("Recall", Float.toString((float) m.weightedRecall()));
metrics.put("False Positive Rate", Float.toString((float) m.weightedFalsePositiveRate()));
metrics.put("True Positive Rate", Float.toString((float) m.weightedTruePositiveRate()));
metrics.put("", "\nConfusion Matrix\n" + Arrays.toString(labels) + "\n" + m.confusionMatrix().toString());
return metrics;
}
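A small end-to-end sketch of calling fit. The constructor arguments below (predictor, label column, test fraction, seed) mirror the fields referenced inside fit and are an assumption here, as are the toy data and column names:

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class ClassifierDemo {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("ClassifierDemo").master("local[*]").getOrCreate();

        // toy data: a string label plus two numeric features
        List<Row> rows = Arrays.asList(
                RowFactory.create("alpha", 1.0, 0.1),
                RowFactory.create("beta", 0.2, 0.9),
                RowFactory.create("alpha", 0.9, 0.2),
                RowFactory.create("beta", 0.1, 0.8));
        StructType schema = new StructType()
                .add("label", DataTypes.StringType)
                .add("x1", DataTypes.DoubleType)
                .add("x2", DataTypes.DoubleType);
        Dataset<Row> raw = spark.createDataFrame(rows, schema);

        // assemble the numeric columns into the "features" vector column expected by fit()
        Dataset<Row> data = new VectorAssembler()
                .setInputCols(new String[] { "x1", "x2" })
                .setOutputCol("features")
                .transform(raw);

        // assumed constructor: (predictor, label column, test fraction, seed)
        SparkMultiClassClassifier classifier = new SparkMultiClassClassifier(new LogisticRegression(), "label", 0.3, 123);
        Map<String, String> metrics = classifier.fit(data);
        metrics.forEach((k, v) -> System.out.println(k + ": " + v));

        spark.stop();
    }
}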
Use of org.apache.spark.sql.Dataset in project bunsen by cerner.
The class Bundles, method saveAsDatabase.
/**
* Saves an RDD of bundles as a database, where each table
* is named after the resource it contains. This offers a simple way to load and query
* bundles in a system, although users with more sophisticated ETL
* operations may want to explicitly write different entities.
*
* <p>
* Note this will access the given RDD of bundles once per resource name,
* so consumers with enough memory should consider calling
* {@link JavaRDD#cache()} so that the RDD is not recomputed for each resource name.
* </p>
*
* @param spark the spark session
* @param bundles an RDD of FHIR Bundles
* @param database the name of the database to write to
* @param resourceNames names of resources to be extracted from the bundle and written
*/
public static void saveAsDatabase(SparkSession spark, JavaRDD<Bundle> bundles, String database, String... resourceNames) {
spark.sql("create database if not exists " + database);
for (String resourceName : resourceNames) {
Dataset ds = extractEntry(spark, bundles, resourceName);
ds.write().saveAsTable(database + "." + resourceName.toLowerCase());
}
}
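A hedged usage sketch of this method. The loadFromDirectory loader is assumed to be the companion reader on the same Bundles class, and the STU3 Bundle import, path, database name, and resource names are placeholders:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
import org.hl7.fhir.dstu3.model.Bundle;
import com.cerner.bunsen.Bundles;

public class SaveBundlesDemo {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("SaveBundlesDemo").enableHiveSupport().getOrCreate();

        // assumed loader; adjust to however bundles are read in your setup
        JavaRDD<Bundle> bundles = Bundles.loadFromDirectory(spark, "/path/to/bundles", 200);

        // cache, since saveAsDatabase walks the RDD once per resource name (see the Javadoc above)
        bundles.cache();

        Bundles.saveAsDatabase(spark, bundles, "fhir_db", "Patient", "Condition", "Observation");

        // each resource is now queryable as a lower-cased table in the target database
        spark.sql("select count(*) from fhir_db.patient").show();
    }
}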
Use of org.apache.spark.sql.Dataset in project bunsen by cerner.
The class Hierarchies, method expandElements.
/**
* Calculates the transitive closure of ancestor values given the dataset of hierarchical
* elements.
*/
private Dataset<Ancestor> expandElements(String hierarchyUri, String hierarchyVersion, Dataset<HierarchicalElement> elements) {
// Map used to find previously created concept nodes so we can use them to build a graph
final Map<String, Map<String, ConceptNode>> conceptNodes = new HashMap<>();
// List of all nodes for simpler iteration
final List<ConceptNode> allNodes = new ArrayList<>();
// Helper function to get or add a node to our collection of nodes
BiFunction<String, String, ConceptNode> getOrAddNode = (system, value) -> {
Map<String, ConceptNode> systemMap = conceptNodes.get(system);
if (systemMap == null) {
systemMap = new HashMap<>();
conceptNodes.put(system, systemMap);
}
ConceptNode node = systemMap.get(value);
if (node == null) {
node = new ConceptNode(system, value);
systemMap.put(value, node);
allNodes.add(node);
}
return node;
};
// Build our graph of nodes
for (HierarchicalElement element : elements.collectAsList()) {
ConceptNode node = getOrAddNode.apply(element.getDescendantSystem(), element.getDescendantValue());
ConceptNode parent = getOrAddNode.apply(element.getAncestorSystem(), element.getAncestorValue());
node.parents.add(parent);
}
// The graph is built, now translate it into ancestors
List<Ancestor> ancestors = allNodes.stream().flatMap(node -> node.getAncestors().stream().map(ancestorNode -> new Ancestor(hierarchyUri, hierarchyVersion, node.system, node.value, ancestorNode.system, ancestorNode.value))).collect(Collectors.toList());
// We convert into a sliced RDD, then to a dataset, so we can specify a slice size and prevent
// Spark from attempting to copy everything at once for very large expansions.
int slices = (int) (ancestors.size() / ANCESTORS_SLICE_SIZE);
if (slices > 1) {
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Ancestor> rdd = jsc.parallelize(ancestors, slices);
return spark.createDataset(rdd.rdd(), ANCESTOR_ENCODER);
} else {
return spark.createDataset(ancestors, ANCESTOR_ENCODER);
}
}
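As a design note, the getOrAddNode helper above can be written more compactly with Map.computeIfAbsent; a behaviorally equivalent sketch, using the same conceptNodes map, allNodes list, and ConceptNode type as the surrounding method:

BiFunction<String, String, ConceptNode> getOrAddNode = (system, value) ->
        conceptNodes.computeIfAbsent(system, s -> new HashMap<>())
                .computeIfAbsent(value, v -> {
                    // create the node on first use and track it for later iteration
                    ConceptNode node = new ConceptNode(system, value);
                    allNodes.add(node);
                    return node;
                });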