Use of org.apache.spark.SparkContext in project Gaffer by gchq.
From the class ImportRDDOfElementsHandlerTest, method checkImportRDDOfElements.
@Test
public void checkImportRDDOfElements() throws OperationException, IOException {
    final Graph graph1 = new Graph.Builder()
            .addSchema(getClass().getResourceAsStream("/schema/dataSchema.json"))
            .addSchema(getClass().getResourceAsStream("/schema/dataTypes.json"))
            .addSchema(getClass().getResourceAsStream("/schema/storeTypes.json"))
            .storeProperties(getClass().getResourceAsStream("/store.properties"))
            .build();
    final ArrayBuffer<Element> elements = new ArrayBuffer<>();
    for (int i = 0; i < 10; i++) {
        final Entity entity = new Entity(TestGroups.ENTITY);
        entity.setVertex("" + i);
        final Edge edge1 = new Edge(TestGroups.EDGE);
        edge1.setSource("" + i);
        edge1.setDestination("B");
        edge1.setDirected(false);
        edge1.putProperty(TestPropertyNames.COUNT, 2);
        final Edge edge2 = new Edge(TestGroups.EDGE);
        edge2.setSource("" + i);
        edge2.setDestination("C");
        edge2.setDirected(false);
        edge2.putProperty(TestPropertyNames.COUNT, 4);
        elements.$plus$eq(edge1);
        elements.$plus$eq(edge2);
        elements.$plus$eq(entity);
    }
    final User user = new User();
    final SparkConf sparkConf = new SparkConf()
            .setMaster("local")
            .setAppName("tests")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.kryo.registrator", "uk.gov.gchq.gaffer.spark.serialisation.kryo.Registrator")
            .set("spark.driver.allowMultipleContexts", "true");
    final SparkContext sparkContext = new SparkContext(sparkConf);
    // Create Hadoop configuration and serialise to a string
    final Configuration configuration = new Configuration();
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    configuration.write(new DataOutputStream(baos));
    final String configurationString = new String(baos.toByteArray(), CommonConstants.UTF_8);
    final String outputPath = this.getClass().getResource("/").getPath().toString() + "load";
    final String failurePath = this.getClass().getResource("/").getPath().toString() + "failure";
    final File file = new File(outputPath);
    if (file.exists()) {
        FileUtils.forceDelete(file);
    }
    final RDD<Element> elementRDD = sparkContext.parallelize(elements, 8, ELEMENT_CLASS_TAG);
    final ImportRDDOfElements addRdd = new ImportRDDOfElements.Builder()
            .sparkContext(sparkContext)
            .input(elementRDD)
            .option("outputPath", outputPath)
            .option("failurePath", failurePath)
            .build();
    graph1.execute(addRdd, user);
    FileUtils.forceDelete(file);
    // Check all elements were added
    final GetRDDOfAllElements rddQuery = new GetRDDOfAllElements.Builder()
            .sparkContext(sparkContext)
            .option(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString)
            .build();
    final RDD<Element> rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    final Set<Element> results = new HashSet<>();
    final Element[] returnedElements = (Element[]) rdd.collect();
    for (int i = 0; i < returnedElements.length; i++) {
        results.add(returnedElements[i]);
    }
    assertEquals(elements.size(), results.size());
    sparkContext.stop();
}
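
The ELEMENT_CLASS_TAG passed to parallelize above (and referenced as ClassTagConstants.ELEMENT_CLASS_TAG in the handlers below) is not shown in these snippets. A minimal sketch of what such a constant could look like, assuming it simply wraps the Element class in a Scala ClassTag, is:

import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;
import uk.gov.gchq.gaffer.data.element.Element;

// Hypothetical holder for the ClassTag that Spark's RDD methods require when called from Java;
// the real Gaffer constant may live in a different class, but the idea is the same.
public final class ClassTagConstants {
    public static final ClassTag<Element> ELEMENT_CLASS_TAG = ClassTag$.MODULE$.<Element>apply(Element.class);

    private ClassTagConstants() {
        // utility class, not instantiable
    }
}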
Use of org.apache.spark.SparkContext in project Gaffer by gchq.
From the class GetRDDOfAllElementsHandler, method doOperation.
private RDD<Element> doOperation(final GetRDDOfAllElements operation, final Context context, final AccumuloStore accumuloStore) throws OperationException {
    final SparkContext sparkContext = operation.getSparkContext();
    final Configuration conf = getConfiguration(operation);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf, ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
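
The FirstElement function passed to map is not shown in this snippet. A minimal sketch, assuming it simply projects the Element out of each (Element, NullWritable) pair produced by ElementInputFormat, might look like this:

import java.io.Serializable;
import org.apache.hadoop.io.NullWritable;
import scala.Tuple2;
import scala.runtime.AbstractFunction1;
import uk.gov.gchq.gaffer.data.element.Element;

// Hypothetical mapper: extracts the Element from each key-value pair of the Hadoop RDD.
// It must be Serializable because Spark ships it to the executors.
class FirstElement extends AbstractFunction1<Tuple2<Element, NullWritable>, Element> implements Serializable {
    @Override
    public Element apply(final Tuple2<Element, NullWritable> pair) {
        return pair._1();
    }
}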
Use of org.apache.spark.SparkContext in project Gaffer by gchq.
From the class GetRDDOfElementsHandler, method doOperation.
private RDD<Element> doOperation(final GetRDDOfElements operation, final Context context, final AccumuloStore accumuloStore) throws OperationException {
    final SparkContext sparkContext = operation.getSparkContext();
    final Configuration conf = getConfiguration(operation);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRanges(accumuloStore, conf, operation);
    final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf, ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
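
For context, a seeded query against this handler could be built along the following lines. This is only a hedged sketch: it assumes a builder API consistent with the GetRDDOfAllElements and ImportRDDOfElements examples above, uses EntitySeed inputs as placeholder seeds, and reuses the sparkContext, configurationString, graph1 and user variables from the test above.

// Hypothetical usage: fetch the elements related to vertices "1" and "2" as an RDD.
// The builder methods mirror those shown for GetRDDOfAllElements and ImportRDDOfElements.
final GetRDDOfElements seededQuery = new GetRDDOfElements.Builder()
        .sparkContext(sparkContext)
        .input(new EntitySeed("1"), new EntitySeed("2"))
        .option(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString)
        .build();
final RDD<Element> seededRdd = graph1.execute(seededQuery, user);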
Use of org.apache.spark.SparkContext in project zeppelin by apache.
From the class SparkInterpreter, method createSparkContext_1.
public SparkContext createSparkContext_1() {
    logger.info("------ Create new SparkContext {} -------", getProperty("master"));
    String execUri = System.getenv("SPARK_EXECUTOR_URI");
    String[] jars = null;
    if (Utils.isScala2_10()) {
        jars = (String[]) Utils.invokeStaticMethod(SparkILoop.class, "getAddedJars");
    } else {
        jars = (String[]) Utils.invokeStaticMethod(Utils.findClass("org.apache.spark.repl.Main"), "getAddedJars");
    }
    String classServerUri = null;
    String replClassOutputDirectory = null;
    try {
        // in the case of Spark 1.1.x and 1.2.x
        Method classServer = intp.getClass().getMethod("classServer");
        Object httpServer = classServer.invoke(intp);
        classServerUri = (String) Utils.invokeMethod(httpServer, "uri");
    } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
        // continue
    }
    if (classServerUri == null) {
        try {
            // for Spark 1.3.x
            Method classServer = intp.getClass().getMethod("classServerUri");
            classServerUri = (String) classServer.invoke(intp);
        } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
            // Newer Spark versions (like the patched CDH 5.7.0 one) don't contain this method,
            // so continue instead of throwing an InterpreterException
            logger.warn(String.format("Spark method classServerUri not available due to: [%s]", e.getMessage()));
        }
    }
    if (classServerUri == null) {
        try {
            // for RpcEnv
            Method getClassOutputDirectory = intp.getClass().getMethod("getClassOutputDirectory");
            File classOutputDirectory = (File) getClassOutputDirectory.invoke(intp);
            replClassOutputDirectory = classOutputDirectory.getAbsolutePath();
        } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e) {
            // continue
        }
    }
    if (Utils.isScala2_11()) {
        classServer = createHttpServer(outputDir);
        Utils.invokeMethod(classServer, "start");
        classServerUri = (String) Utils.invokeMethod(classServer, "uri");
    }
    conf.setMaster(getProperty("master")).setAppName(getProperty("spark.app.name"));
    if (classServerUri != null) {
        conf.set("spark.repl.class.uri", classServerUri);
    }
    if (replClassOutputDirectory != null) {
        conf.set("spark.repl.class.outputDir", replClassOutputDirectory);
    }
    if (jars.length > 0) {
        conf.setJars(jars);
    }
    if (execUri != null) {
        conf.set("spark.executor.uri", execUri);
    }
    if (System.getenv("SPARK_HOME") != null) {
        conf.setSparkHome(System.getenv("SPARK_HOME"));
    }
    conf.set("spark.scheduler.mode", "FAIR");
    Properties intpProperty = getProperty();
    for (Object k : intpProperty.keySet()) {
        String key = (String) k;
        String val = toString(intpProperty.get(key));
        if (key.startsWith("spark.") && !val.trim().isEmpty()) {
            logger.debug(String.format("SparkConf: key = [%s], value = [%s]", key, val));
            conf.set(key, val);
        }
    }
    setupConfForPySpark(conf);
    setupConfForSparkR(conf);
    SparkContext sparkContext = new SparkContext(conf);
    return sparkContext;
}
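
The toString(...) helper used in the property-copy loop above is not part of this snippet. A plausible minimal version, assuming it only needs to guard against null values before the caller's val.trim().isEmpty() check, is:

// Hypothetical helper: coerces an interpreter property value to a String,
// returning an empty string for null so the subsequent trim()/isEmpty() check is safe.
private static String toString(final Object value) {
    return value == null ? "" : value.toString();
}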
Use of org.apache.spark.SparkContext in project zeppelin by apache.
From the class SparkSqlInterpreter, method interpret.
@Override
public InterpreterResult interpret(String st, InterpreterContext context) {
    SQLContext sqlc = null;
    SparkInterpreter sparkInterpreter = getSparkInterpreter();
    if (sparkInterpreter.getSparkVersion().isUnsupportedVersion()) {
        return new InterpreterResult(Code.ERROR, "Spark " + sparkInterpreter.getSparkVersion().toString() + " is not supported");
    }
    sparkInterpreter.populateSparkWebUrl(context);
    sqlc = getSparkInterpreter().getSQLContext();
    SparkContext sc = sqlc.sparkContext();
    if (concurrentSQL()) {
        sc.setLocalProperty("spark.scheduler.pool", "fair");
    } else {
        sc.setLocalProperty("spark.scheduler.pool", null);
    }
    sc.setJobGroup(Utils.buildJobGroupId(context), "Zeppelin", false);
    Object rdd = null;
    try {
        // The method signature of sqlc.sql() changed
        // from def sql(sqlText: String): SchemaRDD (1.2 and prior)
        // to def sql(sqlText: String): DataFrame (1.3 and later),
        // so reflection is needed to keep binary compatibility across all Spark versions.
        Method sqlMethod = sqlc.getClass().getMethod("sql", String.class);
        rdd = sqlMethod.invoke(sqlc, st);
    } catch (InvocationTargetException ite) {
        if (Boolean.parseBoolean(getProperty("zeppelin.spark.sql.stacktrace"))) {
            throw new InterpreterException(ite);
        }
        logger.error("Invocation target exception", ite);
        String msg = ite.getTargetException().getMessage() + "\nset zeppelin.spark.sql.stacktrace = true to see full stacktrace";
        return new InterpreterResult(Code.ERROR, msg);
    } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException e) {
        throw new InterpreterException(e);
    }
    String msg = ZeppelinContext.showDF(sc, context, rdd, maxResult);
    sc.clearJobGroup();
    return new InterpreterResult(Code.SUCCESS, msg);
}
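
On Spark 1.3 through 1.6, the reflective sqlMethod.invoke(...) above resolves to a direct call that could be written as below. This sketch is only illustrative, since the whole point of the reflection is to avoid compiling against either return type (SchemaRDD on 1.2.x, DataFrame on 1.3+).

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

// Illustrative only: the direct, non-reflective equivalent when linking against Spark 1.3-1.6.
public final class DirectSqlCall {
    public static Object runSql(final SQLContext sqlc, final String st) {
        final DataFrame result = sqlc.sql(st); // on Spark 1.2.x this same call returns a SchemaRDD
        return result;
    }
}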