use of org.apache.spark.mllib.clustering.KMeans in project geowave by locationtech.
the class KMeansRunner method run.
public void run() throws IOException {
initContext();
// Validate inputs
if (inputDataStore == null) {
LOGGER.error("You must supply an input datastore!");
throw new IOException("You must supply an input datastore!");
}
if (isUseTime()) {
scaledRange = KMeansUtils.setRunnerTimeParams(this, inputDataStore, typeName);
if (scaledRange == null) {
LOGGER.error("Failed to set time params for kmeans. Please specify a valid feature type.");
throw new ParameterException("--useTime option: Failed to set time params");
}
}
// Retrieve the feature adapters
final VectorQueryBuilder bldr = VectorQueryBuilder.newBuilder();
List<String> featureTypeNames;
// If provided, just use the one
if (typeName != null) {
featureTypeNames = new ArrayList<>();
featureTypeNames.add(typeName);
} else {
// otherwise, grab all the feature adapters
featureTypeNames = FeatureDataUtils.getFeatureTypeNames(inputDataStore);
}
bldr.setTypeNames(featureTypeNames.toArray(new String[0]));
// This is required due to some funkiness in GeoWaveInputFormat
final PersistentAdapterStore adapterStore = inputDataStore.createAdapterStore();
final InternalAdapterStore internalAdapterStore = inputDataStore.createInternalAdapterStore();
// Add a spatial filter if requested
try {
if (cqlFilter != null) {
Geometry bbox = null;
String cqlTypeName;
if (typeName == null) {
cqlTypeName = featureTypeNames.get(0);
} else {
cqlTypeName = typeName;
}
final short adapterId = internalAdapterStore.getAdapterId(cqlTypeName);
final DataTypeAdapter<?> adapter = adapterStore.getAdapter(adapterId).getAdapter();
if (adapter instanceof GeotoolsFeatureDataAdapter) {
final String geometryAttribute = ((GeotoolsFeatureDataAdapter) adapter).getFeatureType().getGeometryDescriptor().getLocalName();
Filter filter;
filter = ECQL.toFilter(cqlFilter);
final ExtractGeometryFilterVisitorResult geoAndCompareOpData = (ExtractGeometryFilterVisitorResult) filter.accept(new ExtractGeometryFilterVisitor(GeometryUtils.getDefaultCRS(), geometryAttribute), null);
bbox = geoAndCompareOpData.getGeometry();
}
if ((bbox != null) && !bbox.equals(GeometryUtils.infinity())) {
bldr.constraints(bldr.constraintsFactory().spatialTemporalConstraints().spatialConstraints(bbox).build());
}
}
} catch (final CQLException e) {
LOGGER.error("Unable to parse CQL: " + cqlFilter);
}
// Load RDD from datastore
final RDDOptions kmeansOpts = new RDDOptions();
kmeansOpts.setMinSplits(minSplits);
kmeansOpts.setMaxSplits(maxSplits);
kmeansOpts.setQuery(bldr.build());
final GeoWaveRDD kmeansRDD = GeoWaveRDDLoader.loadRDD(session.sparkContext(), inputDataStore, kmeansOpts);
// Retrieve the input centroids
LOGGER.debug("Retrieving input centroids from RDD...");
centroidVectors = RDDUtils.rddFeatureVectors(kmeansRDD, timeField, scaledTimeRange);
centroidVectors.cache();
// Init the algorithm
final KMeans kmeans = new KMeans();
kmeans.setInitializationMode("kmeans||");
kmeans.setK(numClusters);
kmeans.setMaxIterations(numIterations);
if (epsilon > -1.0) {
kmeans.setEpsilon(epsilon);
}
// Run KMeans
LOGGER.debug("Running KMeans algorithm...");
outputModel = kmeans.run(centroidVectors.rdd());
LOGGER.debug("Writing results to output store...");
writeToOutputStore();
LOGGER.debug("Results successfully written!");
}
Aggregations