Search in sources:

Example 1 with GtfsRelationalDao

use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.

the class AbstractIdentifiableSingleEntityMergeStrategy method hasLikelyIdentifierOverlap.

/**
   * Determines if the entities sharing the same ids in the source and target
   * feeds appear to be similar enough to indicate that
   * {@link EDuplicateDetectionStrategy#IDENTITY} duplicate detection can be
   * used.
   * 
   * The check has two stages: first the id sets of both feeds must overlap
   * enough (at least one common id and an overlap score of at least
   * {@code _minElementsInCommonScoreForAutoDetect}); then the average
   * duplicate score of the entity pairs sharing an id must exceed
   * {@code _minElementsDuplicateScoreForAutoDetect}.
   * 
   * @param context the merge context providing the source and target feeds
   * @return true if identity duplicate detection seems appropriate
   */
@SuppressWarnings("unchecked")
private boolean hasLikelyIdentifierOverlap(GtfsMergeContext context) {
    GtfsRelationalDao source = context.getSource();
    GtfsMutableRelationalDao target = context.getTarget();
    Collection<T> targetEntities = (Collection<T>) target.getAllEntitiesForType(_entityType);
    Collection<T> sourceEntities = (Collection<T>) source.getAllEntitiesForType(_entityType);
    // Each map is named after the feed its entities come from, so that the
    // lookups below hand the right entity to the right scoring argument.
    Map<Serializable, T> targetById = MappingLibrary.mapToValue(targetEntities, "id");
    Map<Serializable, T> sourceById = MappingLibrary.mapToValue(sourceEntities, "id");
    /**
     * First we check to make sure that the two feeds have enough identifiers in
     * common to suggest that identity-based duplicate detection should be used.
     */
    // commonIds starts empty and is passed as an out-parameter; presumably
    // scoreElementOverlap fills it with the ids present in both key sets.
    Set<Serializable> commonIds = new HashSet<Serializable>();
    double elementOverlapScore = DuplicateScoringSupport.scoreElementOverlap(targetById.keySet(), sourceById.keySet(), commonIds);
    if (commonIds.isEmpty() || elementOverlapScore < _minElementsInCommonScoreForAutoDetect) {
        return false;
    }
    /**
     * Now we score entities with the same identifier to see how well they
     * actually match.
     */
    double totalScore = 0.0;
    for (Serializable id : commonIds) {
        T sourceEntity = sourceById.get(id);
        T targetEntity = targetById.get(id);
        totalScore += _duplicateScoringStrategy.score(context, sourceEntity, targetEntity);
    }
    // Average over the common ids; commonIds is non-empty here because of the
    // early return above, so this never divides by zero.
    totalScore /= commonIds.size();
    /**
     * If the score is high enough, identity-based duplication detection should
     * be used.
     */
    return totalScore > _minElementsDuplicateScoreForAutoDetect;
}
Also used : GtfsMutableRelationalDao(org.onebusaway.gtfs.services.GtfsMutableRelationalDao) GtfsRelationalDao(org.onebusaway.gtfs.services.GtfsRelationalDao) Serializable(java.io.Serializable) Collection(java.util.Collection) HashSet(java.util.HashSet)

Example 2 with GtfsRelationalDao

use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.

the class AbstractIdentifiableSingleEntityMergeStrategy method hasLikelyFuzzyOverlap.

/**
   * Determines if the set of entities in the source and target feeds appear to
   * be similar enough when performing fuzzy matching to indicate that
   * {@link EDuplicateDetectionStrategy#FUZZY} duplicate detection can be used.
   * 
   * Scoring is handed to {@code ScoringTask} instances running on a fixed
   * thread pool; this method blocks, polling each task's {@code Result}, until
   * all of them report completion. The pool is shut down before returning so
   * that its worker threads do not outlive the call.
   * 
   * @param context the merge context providing the source and target feeds
   * @return true if fuzzy duplicate detection seems appropriate; false if not,
   *         or if the calling thread is interrupted while waiting (the
   *         interrupt status is restored in that case)
   */
@SuppressWarnings("unchecked")
private boolean hasLikelyFuzzyOverlap(GtfsMergeContext context) {
    GtfsRelationalDao source = context.getSource();
    GtfsMutableRelationalDao target = context.getTarget();
    /**
     * TODO: Fuzzy matching is expensive. Do we really want to compare all of
     * the entities? Or would a sufficiently-large subset do the trick? Can any
     * of this be cached for the actual duplicate detection later on?
     */
    Collection<T> targetEntities = (Collection<T>) target.getAllEntitiesForType(_entityType);
    Collection<T> sourceEntities = (Collection<T>) source.getAllEntitiesForType(_entityType);
    double duplicateElements = 0;
    double totalScore = 0.0;
    /**
     * First we determine a rough set of potentially overlapping entities based
     * on a fuzzy match.
     * 
     * We break up the list of searches and spread it across available CPUs.
     */
    int cpus = Runtime.getRuntime().availableProcessors();
    int start = 0;
    int end = targetEntities.size() / cpus;
    int increment = targetEntities.size() / cpus;
    ExecutorService executorService = Executors.newFixedThreadPool(cpus);
    List<Result> results = new ArrayList<Result>(cpus);
    try {
        if (end < 10) {
            // no need to segregate if the set is small
            Set<T> remainingSourceEntities = new HashSet<T>(sourceEntities);
            Result result = new Result();
            results.add(result);
            executorService.submit(new ScoringTask<T>(context, _duplicateScoringStrategy, targetEntities, remainingSourceEntities, 0, targetEntities.size(), _minElementsInCommonScoreForAutoDetect, result));
        } else {
            // NOTE(review): the last slice ends at cpus * increment, so when the
            // target size is not a multiple of cpus the trailing elements look
            // unassigned; and "start = end + 1" is only gap-free if ScoringTask
            // treats end as inclusive. Confirm against ScoringTask.
            for (int i = 0; i < cpus; i++) {
                // each task gets its own freshly fetched collections and its own
                // copy of the source set rather than sharing the ones above
                Collection<T> t_targetEntities = (Collection<T>) target.getAllEntitiesForType(_entityType);
                Collection<T> t_sourceEntities = (Collection<T>) source.getAllEntitiesForType(_entityType);
                Set<T> t_remainingSourceEntities = new HashSet<T>(t_sourceEntities);
                Result result = new Result();
                results.add(result);
                executorService.submit(new ScoringTask<T>(context, _duplicateScoringStrategy, t_targetEntities, t_remainingSourceEntities, start, end, _minElementsInCommonScoreForAutoDetect, result));
                start = end + 1;
                end = end + increment;
            }
        }
    } finally {
        // shutdown() does not cancel tasks that were already submitted; it only
        // lets the worker threads exit once those tasks have finished, so the
        // pool is not leaked on every invocation.
        executorService.shutdown();
    }
    try {
        // give the executor a chance to run
        Thread.sleep(1 * 1000);
    } catch (InterruptedException e1) {
        // the results will be discarded, so stop the workers and preserve the
        // interrupt for the caller
        executorService.shutdownNow();
        Thread.currentThread().interrupt();
        return false;
    }
    int i = 0;
    for (Result result : results) {
        while (!result.isDone()) {
            try {
                _log.info("waiting on thread[" + i + "] at " + (int) (result.getPercentComplete() * 100) + "% complete (" + _entityType + ")");
                Thread.sleep(30 * 1000);
            } catch (InterruptedException e) {
                // the results will be discarded, so stop the workers and
                // preserve the interrupt for the caller
                executorService.shutdownNow();
                Thread.currentThread().interrupt();
                return false;
            }
        }
        duplicateElements += result.getDuplicateElements();
        totalScore += result.getTotalScore();
        i++;
    // we no longer remove the best match to avoid concurrency issues
    }
    /**
     * There needs to be sufficient overlap between the two feeds for us to
     * consider using fuzzy duplicate detection in the first place.
     */
    double elementsInCommon = (duplicateElements / targetEntities.size() + duplicateElements / sourceEntities.size()) / 2;
    if (elementsInCommon < _minElementsInCommonScoreForAutoDetect) {
        return false;
    }
    /**
     * If there is sufficient overlap, only use fuzzy detection if the entities
     * themselves match well.
     */
    // double arithmetic: with zero duplicates this yields NaN, and the
    // comparison below is then false
    totalScore /= duplicateElements;
    return totalScore > _minElementsDuplicateScoreForAutoDetect;
}
Also used : GtfsMutableRelationalDao(org.onebusaway.gtfs.services.GtfsMutableRelationalDao) GtfsRelationalDao(org.onebusaway.gtfs.services.GtfsRelationalDao) ArrayList(java.util.ArrayList) ExecutorService(java.util.concurrent.ExecutorService) Collection(java.util.Collection) HashSet(java.util.HashSet)

Example 3 with GtfsRelationalDao

use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.

the class GtfsMergerTest method test.

/**
   * Merges two small feeds that use the same agency, route, stop and trip
   * setup but different calendar date ranges and stop-time sequences, then
   * checks the entity counts of the merged result.
   */
@Test
public void test() throws IOException {
    // older feed: service 2012-05-04 .. 2012-06-08, trip visits three stops
    _oldGtfs.putAgencies(1);
    _oldGtfs.putRoutes(1);
    _oldGtfs.putStops(3);
    _oldGtfs.putCalendars(1, "mask=1111100", "start_date=20120504", "end_date=20120608");
    _oldGtfs.putTrips(1, "r0", "sid0");
    _oldGtfs.putStopTimes("t0", "s0,s1,s2");

    // newer feed: service 2012-06-01 .. 2012-06-30, trip visits two stops
    _newGtfs.putAgencies(1);
    _newGtfs.putRoutes(1);
    _newGtfs.putStops(3);
    _newGtfs.putCalendars(1, "mask=1111100", "start_date=20120601", "end_date=20120630");
    _newGtfs.putTrips(1, "r0", "sid0");
    _newGtfs.putStopTimes("t0", "s0,s1");

    final GtfsRelationalDao merged = merge();

    // agencies, routes and stops collapse to one set; both trips are kept
    assertEquals(1, merged.getAllAgencies().size());
    assertEquals(1, merged.getAllRoutes().size());
    assertEquals(3, merged.getAllStops().size());
    assertEquals(2, merged.getAllTrips().size());
}
Also used : GtfsRelationalDao(org.onebusaway.gtfs.services.GtfsRelationalDao) Test(org.junit.Test)

Example 4 with GtfsRelationalDao

use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.

the class GtfsMergerTest method testRenamingTrips.

/**
   * Verifies that every trip in the merged feed still has stop times when the
   * trip strategy uses identity-based duplicate detection, i.e. that stop
   * times are preserved when trips are renamed (issue 14).
   */
@Test
public void testRenamingTrips() throws IOException {
    // fixture mirrors the basic merge test, except for the new feed's calendar
    _oldGtfs.putAgencies(1);
    _oldGtfs.putRoutes(1);
    _oldGtfs.putStops(3);
    _oldGtfs.putCalendars(1, "mask=1111100", "start_date=20120504", "end_date=20120608");
    _oldGtfs.putTrips(1, "r0", "sid0");
    _oldGtfs.putStopTimes("t0", "s0,s1,s2");

    _newGtfs.putAgencies(1);
    _newGtfs.putRoutes(1);
    _newGtfs.putStops(3);
    _newGtfs.putCalendars(1, "mask=1111100", "start_date=20120601", "end_date=20120601");
    _newGtfs.putTrips(1, "r0", "sid0");
    _newGtfs.putStopTimes("t0", "s0,s1");

    // force identity-based duplicate detection for trips
    final TripMergeStrategy tripStrategy = new TripMergeStrategy();
    tripStrategy.setDuplicateDetectionStrategy(EDuplicateDetectionStrategy.IDENTITY);
    _merger.setTripStrategy(tripStrategy);

    final GtfsRelationalDao merged = merge();
    for (Trip mergedTrip : merged.getAllTrips()) {
        // no trip may come out of the merge without stop times
        assertTrue(!merged.getStopTimesForTrip(mergedTrip).isEmpty());
    }
}
Also used : GtfsRelationalDao(org.onebusaway.gtfs.services.GtfsRelationalDao) Trip(org.onebusaway.gtfs.model.Trip) TripMergeStrategy(org.onebusaway.gtfs_merge.strategies.TripMergeStrategy) Test(org.junit.Test)

Example 5 with GtfsRelationalDao

use of org.onebusaway.gtfs.services.GtfsRelationalDao in project onebusaway-gtfs-modules by OneBusAway.

the class AbstractSingleEntityMergeStrategy method merge.

/**
   * Passes every entity of this strategy's entity type found in the source
   * feed to {@code mergeEntity}, one at a time.
   * 
   * @param context the merge context providing the source feed
   */
@Override
public void merge(GtfsMergeContext context) {
    for (Object candidate : context.getSource().getAllEntitiesForType(_entityType)) {
        // the cast throws ClassCastException for any element that is not an
        // IdentityBean
        IdentityBean<?> bean = (IdentityBean<?>) candidate;
        mergeEntity(context, bean);
    }
}
Also used : GtfsRelationalDao(org.onebusaway.gtfs.services.GtfsRelationalDao)

Aggregations

GtfsRelationalDao (org.onebusaway.gtfs.services.GtfsRelationalDao): 29; Test (org.junit.Test): 15; AgencyAndId (org.onebusaway.gtfs.model.AgencyAndId): 11; Trip (org.onebusaway.gtfs.model.Trip): 8; GtfsMutableRelationalDao (org.onebusaway.gtfs.services.GtfsMutableRelationalDao): 7; ServiceCalendar (org.onebusaway.gtfs.model.ServiceCalendar): 6; ServiceCalendarDate (org.onebusaway.gtfs.model.ServiceCalendarDate): 5; Stop (org.onebusaway.gtfs.model.Stop): 5; StopTime (org.onebusaway.gtfs.model.StopTime): 5; Agency (org.onebusaway.gtfs.model.Agency): 3; TripMergeStrategy (org.onebusaway.gtfs_merge.strategies.TripMergeStrategy): 3; Collection (java.util.Collection): 2; HashSet (java.util.HashSet): 2; Frequency (org.onebusaway.gtfs.model.Frequency): 2; Route (org.onebusaway.gtfs.model.Route): 2; ShapePoint (org.onebusaway.gtfs.model.ShapePoint): 2; ServiceDate (org.onebusaway.gtfs.model.calendar.ServiceDate): 2; AgencyMergeStrategy (org.onebusaway.gtfs_merge.strategies.AgencyMergeStrategy): 2; StopMergeStrategy (org.onebusaway.gtfs_merge.strategies.StopMergeStrategy): 2; Serializable (java.io.Serializable): 1