use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.
the class AbstractIdentifiableSingleEntityMergeStrategy method hasLikelyIdentifierOverlap.
/**
 * Decides whether {@link EDuplicateDetectionStrategy#IDENTITY} duplicate
 * detection looks appropriate for the feeds being merged.
 *
 * Two checks are applied in order: the source and target feeds must share
 * enough entity ids, and the entities behind those shared ids must, on
 * average, score highly enough against each other.
 *
 * @param context the merge context supplying the source and target feeds
 * @return true if identity duplicate detection seems appropriate
 */
@SuppressWarnings("unchecked")
private boolean hasLikelyIdentifierOverlap(GtfsMergeContext context) {
  GtfsRelationalDao sourceDao = context.getSource();
  GtfsMutableRelationalDao targetDao = context.getTarget();

  // Load the entities of this strategy's type from both feeds and index them
  // by their "id" property.
  Collection<T> entitiesInTarget = (Collection<T>) targetDao.getAllEntitiesForType(_entityType);
  Collection<T> entitiesInSource = (Collection<T>) sourceDao.getAllEntitiesForType(_entityType);
  Map<Serializable, T> targetIndex = MappingLibrary.mapToValue(entitiesInTarget, "id");
  Map<Serializable, T> sourceIndex = MappingLibrary.mapToValue(entitiesInSource, "id");

  // Step 1: the feeds need enough ids in common. scoreElementOverlap also
  // fills sharedIds with the ids present in both key sets.
  Set<Serializable> sharedIds = new HashSet<Serializable>();
  double idOverlapScore = DuplicateScoringSupport.scoreElementOverlap(
      targetIndex.keySet(), sourceIndex.keySet(), sharedIds);
  if (sharedIds.isEmpty()) {
    return false;
  }
  if (idOverlapScore < _minElementsInCommonScoreForAutoDetect) {
    return false;
  }

  // Step 2: average the pairwise score of the entities that share an id.
  double scoreSum = 0.0;
  for (Serializable sharedId : sharedIds) {
    T fromTarget = targetIndex.get(sharedId);
    T fromSource = sourceIndex.get(sharedId);
    scoreSum += _duplicateScoringStrategy.score(context, fromSource, fromTarget);
  }
  double averageScore = scoreSum / sharedIds.size();

  // Identity-based detection is only suggested when the average is strictly
  // above the configured threshold.
  return averageScore > _minElementsDuplicateScoreForAutoDetect;
}
use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.
the class AbstractIdentifiableSingleEntityMergeStrategy method hasLikelyFuzzyOverlap.
/**
 * Determines if the set of entities in the source and target feeds appear to
 * be similar enough when performing fuzzy matching to indicate that
 * {@link EDuplicateDetectionStrategy#FUZZY} duplicate detection can be used.
 *
 * The scoring work is submitted to a fixed thread pool sized to the number of
 * available processors. The pool is always shut down before this method
 * returns so that its worker threads are not leaked.
 *
 * @param context the merge context supplying the source and target feeds
 * @return true if fuzzy duplicate detection seems appropriate; false if the
 *         overlap or score thresholds are not met, or if the calling thread
 *         is interrupted while waiting (the interrupt flag is restored)
 */
@SuppressWarnings("unchecked")
private boolean hasLikelyFuzzyOverlap(GtfsMergeContext context) {
  GtfsRelationalDao source = context.getSource();
  GtfsMutableRelationalDao target = context.getTarget();
  /*
   * TODO: Fuzzy matching is expensive. Do we really want to compare all of
   * the entities? Or would a sufficiently-large subset do the trick? Can any
   * of this be cached for the actual duplicate detection later on?
   */
  Collection<T> targetEntities = (Collection<T>) target.getAllEntitiesForType(_entityType);
  Collection<T> sourceEntities = (Collection<T>) source.getAllEntitiesForType(_entityType);
  double duplicateElements = 0;
  double totalScore = 0.0;
  /*
   * First we determine a rough set of potentially overlapping entities based
   * on a fuzzy match.
   *
   * We break up the list of searches and spread it across available CPUs.
   */
  int cpus = Runtime.getRuntime().availableProcessors();
  int start = 0;
  int end = targetEntities.size() / cpus;
  int increment = targetEntities.size() / cpus;
  ExecutorService executorService = Executors.newFixedThreadPool(cpus);
  List<Result> results = new ArrayList<Result>(cpus);
  // Set to true only once every result has been collected; any other exit
  // path (interrupt, exception) cancels the outstanding work in the finally.
  boolean finished = false;
  try {
    if (end < 10) {
      // no need to split the work if the set is small
      Set<T> remainingSourceEntities = new HashSet<T>(sourceEntities);
      Result result = new Result();
      results.add(result);
      executorService.submit(new ScoringTask<T>(context, _duplicateScoringStrategy, targetEntities, remainingSourceEntities, 0, targetEntities.size(), _minElementsInCommonScoreForAutoDetect, result));
    } else {
      // NOTE(review): whether "start = end + 1" skips or double-counts an
      // element, and whether the remainder of size % cpus is ever scored,
      // depends on how ScoringTask interprets [start, end] - confirm there.
      for (int i = 0; i < cpus; i++) {
        // each task gets its own freshly fetched collections
        Collection<T> t_targetEntities = (Collection<T>) target.getAllEntitiesForType(_entityType);
        Collection<T> t_sourceEntities = (Collection<T>) source.getAllEntitiesForType(_entityType);
        Set<T> t_remainingSourceEntities = new HashSet<T>(t_sourceEntities);
        Result result = new Result();
        results.add(result);
        executorService.submit(new ScoringTask<T>(context, _duplicateScoringStrategy, t_targetEntities, t_remainingSourceEntities, start, end, _minElementsInCommonScoreForAutoDetect, result));
        start = end + 1;
        end = end + increment;
      }
    }
    // No more tasks will be submitted: let the already-queued ones finish and
    // then allow the worker threads to terminate.
    executorService.shutdown();
    try {
      // give the executor a chance to run
      Thread.sleep(1 * 1000);
    } catch (InterruptedException e1) {
      // restore the flag so callers can observe the interruption
      Thread.currentThread().interrupt();
      return false;
    }
    int i = 0;
    for (Result result : results) {
      while (!result.isDone()) {
        try {
          _log.info("waiting on thread[" + i + "] at " + (int) (result.getPercentComplete() * 100) + "% complete (" + _entityType + ")");
          Thread.sleep(30 * 1000);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          return false;
        }
      }
      duplicateElements += result.getDuplicateElements();
      totalScore += result.getTotalScore();
      i++;
      // we no longer remove the best match to avoid concurrency issues
    }
    finished = true;
  } finally {
    if (!finished) {
      // abandon any scoring still in flight; its results will not be used
      executorService.shutdownNow();
    }
  }
  /*
   * There needs to be sufficient overlap between the two feeds for us to
   * consider using fuzzy duplicate detection in the first place.
   */
  double elementsInCommon = (duplicateElements / targetEntities.size() + duplicateElements / sourceEntities.size()) / 2;
  if (elementsInCommon < _minElementsInCommonScoreForAutoDetect) {
    return false;
  }
  /*
   * If there is sufficient overlap, only use fuzzy detection if the entities
   * themselves match well.
   */
  totalScore /= duplicateElements;
  return totalScore > _minElementsDuplicateScoreForAutoDetect;
}
use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.
the class GtfsMergerTest method test.
/**
 * Merges two small feeds that use the same agency, route, stop, calendar and
 * trip fixtures but differ in calendar date range and in the stop times of
 * trip "t0", then checks the entity counts of the merged result.
 */
@Test
public void test() throws IOException {
  final String serviceMask = "mask=1111100";
  final String routeId = "r0";
  final String serviceId = "sid0";
  final String tripId = "t0";

  // Older feed: stop times for t0 cover s0, s1 and s2.
  _oldGtfs.putAgencies(1);
  _oldGtfs.putRoutes(1);
  _oldGtfs.putStops(3);
  _oldGtfs.putCalendars(1, serviceMask, "start_date=20120504", "end_date=20120608");
  _oldGtfs.putTrips(1, routeId, serviceId);
  _oldGtfs.putStopTimes(tripId, "s0,s1,s2");

  // Newer feed: later calendar range, stop times for t0 cover only s0 and s1.
  _newGtfs.putAgencies(1);
  _newGtfs.putRoutes(1);
  _newGtfs.putStops(3);
  _newGtfs.putCalendars(1, serviceMask, "start_date=20120601", "end_date=20120630");
  _newGtfs.putTrips(1, routeId, serviceId);
  _newGtfs.putStopTimes(tripId, "s0,s1");

  GtfsRelationalDao merged = merge();

  // Expected counts after the merge: one agency, one route, three stops and
  // two trips.
  assertEquals(1, merged.getAllAgencies().size());
  assertEquals(1, merged.getAllRoutes().size());
  assertEquals(3, merged.getAllStops().size());
  assertEquals(2, merged.getAllTrips().size());
}
use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.
the class GtfsMergerTest method testRenamingTrips.
/**
 * Test that when renaming trips stop times are preserved (issue 14).
 *
 * The trip merge strategy is forced to
 * {@link EDuplicateDetectionStrategy#IDENTITY}, and every trip in the merged
 * feed is then required to have at least one stop time.
 */
@Test
public void testRenamingTrips() throws IOException {
  // Fixture setup mirrors the basic merge test, except that the newer feed's
  // calendar starts and ends on the same day.
  final String serviceMask = "mask=1111100";
  final String routeId = "r0";
  final String serviceId = "sid0";
  final String tripId = "t0";

  _oldGtfs.putAgencies(1);
  _oldGtfs.putRoutes(1);
  _oldGtfs.putStops(3);
  _oldGtfs.putCalendars(1, serviceMask, "start_date=20120504", "end_date=20120608");
  _oldGtfs.putTrips(1, routeId, serviceId);
  _oldGtfs.putStopTimes(tripId, "s0,s1,s2");

  _newGtfs.putAgencies(1);
  _newGtfs.putRoutes(1);
  _newGtfs.putStops(3);
  _newGtfs.putCalendars(1, serviceMask, "start_date=20120601", "end_date=20120601");
  _newGtfs.putTrips(1, routeId, serviceId);
  _newGtfs.putStopTimes(tripId, "s0,s1");

  // Use identity-based duplicate detection for trips.
  TripMergeStrategy tripStrategy = new TripMergeStrategy();
  tripStrategy.setDuplicateDetectionStrategy(EDuplicateDetectionStrategy.IDENTITY);
  _merger.setTripStrategy(tripStrategy);

  GtfsRelationalDao merged = merge();

  // No trip in the merged feed may end up without stop times.
  for (Trip mergedTrip : merged.getAllTrips()) {
    int stopTimeCount = merged.getStopTimesForTrip(mergedTrip).size();
    assertTrue(stopTimeCount > 0);
  }
}
use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.
the class AbstractSingleEntityMergeStrategy method merge.
/**
 * Passes every entity of this strategy's entity type found in the source
 * feed to {@code mergeEntity}, one at a time.
 *
 * @param context the merge context supplying the source feed
 */
@Override
public void merge(GtfsMergeContext context) {
  // Each element is cast to IdentityBean before being handed off; an entity
  // of any other type would fail that cast.
  for (Object candidate : context.getSource().getAllEntitiesForType(_entityType)) {
    IdentityBean<?> bean = (IdentityBean<?>) candidate;
    mergeEntity(context, bean);
  }
}
Aggregations