Python geopandas.sjoin() Examples
The following are 12 code examples of geopandas.sjoin(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module geopandas, or try the search function.
Example #1
Source File: test_geopandas.py From docker-python with Apache License 2.0 | 6 votes |
def test_spatial_join(self):
    cities = geopandas.read_file(geopandas.datasets.get_path('naturalearth_cities'))
    world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
    countries = world[['geometry', 'name']]
    countries = countries.rename(columns={'name': 'country'})
    cities_with_country = geopandas.sjoin(cities, countries, how="inner", op='intersects')
    self.assertTrue(cities_with_country.size > 1)
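A note on API drift: the op= keyword used throughout these examples was deprecated in GeoPandas 0.10 in favor of predicate=, and the bundled geopandas.datasets module was removed in GeoPandas 1.0. A minimal sketch of the same join against current GeoPandas, assuming local copies of the Natural Earth files:

import geopandas

# Assumed local copies of the Natural Earth data; geopandas.datasets
# is no longer bundled with GeoPandas 1.0+
cities = geopandas.read_file("naturalearth_cities.shp")
world = geopandas.read_file("naturalearth_lowres.shp")

countries = world[['geometry', 'name']].rename(columns={'name': 'country'})
# GeoPandas >= 0.10 takes the join predicate as `predicate`, not `op`
cities_with_country = geopandas.sjoin(cities, countries, how="inner",
                                      predicate="intersects")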
Example #2
Source File: data_extract.py From urbansprawl with MIT License | 5 votes |
def get_df_extract(df_data, poly_gdf, operation="within"):
    """ Indexes the input geo-data frame within an input region of interest.
    If the region of interest is given as a polygon, its bounding box is indexed

    Parameters
    ----------
    df_data : geopandas.GeoDataFrame
        input data frame to index
    poly_gdf : geopandas.GeoDataFrame
        geodataframe containing the region of interest in the form of a polygon
    operation : string
        the desired spatial join operation: 'within' or 'intersects'

    Returns
    ----------
    geopandas.GeoDataFrame
        returns the population data frame indexed within the region of interest
    """
    # Project to the same coordinate system
    poly_gdf = ox.project_gdf(poly_gdf, to_crs=df_data.crs)
    # Spatial join
    df_extract = gpd.sjoin(df_data, poly_gdf, op=operation)
    # Keep original columns
    df_extract = df_extract[df_data.columns]
    return df_extract
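Re-selecting df_data.columns at the end matters because sjoin appends the right frame's attributes plus an index_right column to every match. A hypothetical call, assuming a population point layer and a single-polygon boundary file:

import geopandas as gpd

# Hypothetical inputs for illustration
df_population = gpd.read_file("population_points.shp")
city_boundary = gpd.read_file("city_boundary.shp")

# Keep only the population points falling inside the boundary polygon
df_city_population = get_df_extract(df_population, city_boundary,
                                    operation="within")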
Example #3
Source File: utils.py From urbansprawl with MIT License | 5 votes |
def population_downscaling_validation(df_osm_built, df_insee):
    """ Validates the population downscaling estimation by aggregating the
    estimated population of the buildings lying within each population square,
    which allows comparing the real population count with the estimated one.
    Updates a new column 'pop_estimation' for each square in the population data frame

    Parameters
    ----------
    df_osm_built : geopandas.GeoDataFrame
        input buildings with computed population count
    df_insee : geopandas.GeoDataFrame
        INSEE population data

    Returns
    ----------
    """
    df_osm_built['geom'] = df_osm_built.geometry
    df_osm_built_residential = df_osm_built[df_osm_built.apply(
        lambda x: x.landuses_m2['residential'] > 0, axis=1)]
    df_insee.crs = df_osm_built_residential.crs

    # Intersect gridded population squares with buildings
    sjoin = gpd.sjoin(df_insee, df_osm_built_residential, op='intersects')
    # Weight each building's population by the share of its footprint
    # lying within the square
    sjoin['pop_estimation'] = sjoin.apply(
        lambda x: x.population * (x.geom.intersection(x.geometry).area / x.geom.area),
        axis=1)

    # Initialize
    df_insee['pop_estimation'] = np.nan
    sum_pop_per_square = sjoin.groupby(sjoin.index)['pop_estimation'].sum()
    df_insee.loc[sum_pop_per_square.index, "pop_estimation"] = sum_pop_per_square.values

    # Drop the auxiliary geometry column
    df_osm_built.drop('geom', axis=1, inplace=True)
    # Set NaN values to 0
    df_insee.loc[df_insee.pop_estimation.isnull(), "pop_estimation"] = 0

    # Compute absolute and relative error
    df_insee["absolute_error"] = df_insee.apply(
        lambda x: abs(x.pop_count - x.pop_estimation), axis=1)
    df_insee["relative_error"] = df_insee.apply(
        lambda x: abs(x.pop_count - x.pop_estimation) / x.pop_count, axis=1)
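The heart of this validation is the area-weighting term population * (building ∩ square).area / building.area. A self-contained sketch of that computation on toy geometries (all values illustrative):

from shapely.geometry import Polygon

# A 200 m population cell and a building straddling its edge
square = Polygon([(0, 0), (200, 0), (200, 200), (0, 200)])
building = Polygon([(150, 150), (250, 150), (250, 250), (150, 250)])

population = 120  # estimated population of the building
# Fraction of the building footprint lying inside the cell
share = building.intersection(square).area / building.area
print(population * share)  # 120 * 0.25 = 30.0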
Example #4
Source File: refine.py From OpenSarToolkit with MIT License | 5 votes |
def _remove_outside_aoi(aoi_gdf, inventory_df):
    '''Removes scenes that are located outside the AOI

    The search routine works over a simplified representation of the AOI.
    This may then include acquisitions that do not overlap with the AOI.
    In this step we sort out the scenes that are completely outside the
    actual AOI.

    Args:
        aoi_gdf (gdf): the AOI as a GeoDataFrame
        inventory_df (gdf): an OST compliant Sentinel-1 inventory GeoDataFrame

    Returns:
        inventory_df (gdf): the manipulated inventory GeoDataFrame
    '''
    # get columns of input dataframe for the later return statement
    cols = inventory_df.columns

    # 1) keep only intersecting footprints (double, since we do this before)
    inventory_df = gpd.sjoin(inventory_df, aoi_gdf, how='inner', op='intersects')

    # if the AOI gdf has an id field, the join renames it to id_left
    if 'id_left' in inventory_df.columns.tolist():
        # rename id_left back to id
        inventory_df.columns = [
            'id' if x == 'id_left' else x
            for x in inventory_df.columns.tolist()]

    return inventory_df[cols]
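The id_left rename is a consequence of sjoin suffixing column names that appear in both frames. sjoin also accepts lsuffix/rsuffix arguments, so a possible alternative is choosing non-colliding suffixes up front; a small self-contained sketch:

import geopandas as gpd
from shapely.geometry import Point, Polygon

left = gpd.GeoDataFrame({'id': [1]}, geometry=[Point(0.5, 0.5)])
right = gpd.GeoDataFrame({'id': [7]},
                         geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])])

# Both frames carry an 'id' column; custom suffixes avoid the rename step
joined = gpd.sjoin(left, right, how='inner', op='intersects',
                   lsuffix='inv', rsuffix='aoi')
# Duplicated columns now appear as 'id_inv' and 'id_aoi'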
Example #5
Source File: BaseModel.py From spatial_access with GNU General Public License v3.0 | 5 votes |
def _spatial_join_community_index(self, dataframe,
                                  shapefile='data/chicago_boundaries/chicago_boundaries.shp',
                                  spatial_index='community',
                                  projection='epsg:4326'):
    """
    Join the dataframe with location data from shapefile.
    Args:
        dataframe: pandas dataframe with unique id.
        shapefile: shapefile containing geometry.
        spatial_index: column name of the aggregation area in the shapefile.
        projection: defaults to 'epsg:4326'
    Returns:
        dataframe.
    Raises:
        ShapefileNotFoundException: Shapefile not found.
        SpatialIndexNotMatchedException: spatial_index not found in shapefile.
    """
    geometry = [Point(xy) for xy in zip(dataframe['lon'], dataframe['lat'])]
    crs = {'init': projection}
    geo_original = gpd.GeoDataFrame(dataframe, crs=crs, geometry=geometry)
    try:
        boundaries_gdf = gpd.read_file(shapefile)
    except FileNotFoundError:
        raise ShapefileNotFoundException('shapefile not found: {}'.format(shapefile))
    geo_result = gpd.sjoin(boundaries_gdf, geo_original, how='right', op='intersects')
    dataframe_columns = list(dataframe.columns)
    geo_result.rename(columns={spatial_index: 'spatial_index'}, inplace=True)
    dataframe_columns.append('spatial_index')
    dataframe_columns.append('geometry')
    try:
        geo_result = geo_result[dataframe_columns]
    except KeyError:
        raise SpatialIndexNotMatchedException('Unable to match spatial_index:{}'.format(spatial_index))
    if len(geo_result) != len(dataframe):
        self.logger.warning('Length of joined dataframe ({}) != length of input dataframe ({})'
                            .format(len(geo_result), len(dataframe)))
    return geo_result
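Two details are worth flagging: crs={'init': ...} is the legacy pyproj syntax (current GeoPandas accepts a plain 'EPSG:4326' string), and how='right' keeps every row of the point frame even when no boundary matches it. A sketch of the point-construction step in the modern spelling, with made-up coordinates:

import pandas as pd
import geopandas as gpd

# Made-up coordinates for illustration
df = pd.DataFrame({'id': [1, 2], 'lon': [-87.63, -87.65], 'lat': [41.88, 41.79]})
# gpd.points_from_xy replaces the manual Point list comprehension
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'], df['lat']),
                       crs='EPSG:4326')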
Example #6
Source File: downscaling.py From urbansprawl with MIT License | 4 votes |
def proportional_population_downscaling(df_osm_built, df_insee):
    """ Performs a proportional population downscaling considering the surface
    dedicated to residential land use.
    Associates the estimated population to each building in column 'population'

    Parameters
    ----------
    df_osm_built : geopandas.GeoDataFrame
        input buildings with computed residential surface
    df_insee : geopandas.GeoDataFrame
        INSEE population data

    Returns
    ----------
    """
    if df_insee.crs != df_osm_built.crs:
        # If projections do not match:
        # First project to lat-long coordinates, then project to UTM coordinates
        df_insee = ox.project_gdf(ox.project_gdf(df_insee, to_latlong=True))
        # OSM building geometries are already projected
        assert df_insee.crs == df_osm_built.crs

    df_osm_built['geom'] = df_osm_built.geometry
    df_osm_built_residential = df_osm_built[df_osm_built.apply(
        lambda x: x.landuses_m2['residential'] > 0, axis=1)]

    # Loading/saving using geopandas loses the 'ellps' key
    df_insee.crs = df_osm_built_residential.crs

    # Intersect gridded population squares with buildings
    sjoin = gpd.sjoin(df_insee, df_osm_built_residential, op='intersects')
    # Calculate the residential area within each square
    # (share of the building footprint lying within the square)
    sjoin['residential_m2_within'] = sjoin.apply(
        lambda x: x.landuses_m2['residential'] * (x.geom.intersection(x.geometry).area / x.geom.area),
        axis=1)

    # Initialize
    df_insee['residential_m2_within'] = 0
    # Sum residential area within each square
    sum_m2_per_square = sjoin.groupby(sjoin.index)['residential_m2_within'].sum()
    # Assign total residential area within each square
    df_insee.loc[sum_m2_per_square.index, "residential_m2_within"] = sum_m2_per_square.values
    # Get the number of m^2 per person
    df_insee["m2_per_person"] = df_insee.apply(
        lambda x: x.residential_m2_within / x.pop_count, axis=1)

    def population_building(x, df_insee):
        # Sum over each square: m2 of building within the square / m2 per person
        return (x.get('m2', []) / df_insee.loc[x.get('idx', [])].m2_per_person).sum()

    # Index: buildings; values: idx: indices of the gridded population squares,
    # m2: residential m2 within each square
    buildings_square_m2_association = sjoin.groupby('index_right').apply(
        lambda x: {'idx': list(x.index), 'm2': list(x.residential_m2_within)})

    # Associate the estimated population to each building
    df_osm_built.loc[buildings_square_m2_association.index, "population"] = \
        buildings_square_m2_association.apply(lambda x: population_building(x, df_insee))

    # Drop the auxiliary geometry column
    df_osm_built.drop('geom', axis=1, inplace=True)
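The groupby(sjoin.index) step is the idiom that folds one-to-many join results back onto the left frame: each population square can intersect many buildings, so per-match contributions are summed per original index before being assigned back with .loc. A stripped-down illustration with toy numbers:

import pandas as pd

# After an sjoin, the left frame's index repeats once per matched building
contributions = pd.Series([40.0, 25.0, 10.0], index=[0, 0, 1],
                          name='residential_m2_within')
per_square = contributions.groupby(level=0).sum()
# per_square: 65.0 for square 0, 10.0 for square 1; assign back with .loc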
Example #7
Source File: utils.py From urbansprawl with MIT License | 4 votes |
def get_population_df_filled_empty_squares(df_insee):
    """ Add empty squares as 0-population box-squares

    Parameters
    ----------
    df_insee : geopandas.GeoDataFrame
        INSEE population data

    Returns
    ----------
    """
    def get_northing_easting(x):
        # Extract northing and easting coordinates
        north, east = x.idINSPIRE.split("N")[1].split("E")
        x["north"] = int(north)
        x["east"] = int(east)
        return x

    # Project data to its original projection coordinates
    df_insee_3035 = ox.project_gdf(df_insee, to_crs="+init=epsg:3035")
    # Get northing and easting coordinates
    coordinates = df_insee.apply(lambda x: get_northing_easting(x), axis=1)[["north", "east"]]

    # +100 meters to obtain the centroid of each box
    coords_offset = 100
    # Input data step
    step = 200.

    # North, east coordinates denote the south-west box endpoint:
    north_min, north_max = coordinates.north.min() + coords_offset, coordinates.north.max() + coords_offset
    east_min, east_max = coordinates.east.min() + coords_offset, coordinates.east.max() + coords_offset

    # Create a mesh grid with one point per square centroid
    # (each square has an extent of 200m by 200m)
    xv, yv = np.meshgrid(np.arange(east_min, east_max, step),
                         np.arange(north_min, north_max, step))

    # For every given coordinate, if a box does not exist (no population),
    # create it with an initial population of 0
    empty_population_box = []
    for E, N in zip(xv.ravel(), yv.ravel()):
        # Center-point
        point_df = gpd.GeoDataFrame([Point(E, N)], columns=["geometry"], crs="+init=epsg:3035")
        if gpd.sjoin(point_df, df_insee_3035).empty:
            # Does not intersect any existing square-box: create a new square
            empty_population_box.append(Polygon([
                (E - 100., N - 100.), (E - 100., N + 100.), (E + 100., N + 100.),
                (E + 100., N - 100.), (E - 100., N - 100.)]))

    # Concatenate the original data frame with the empty squares
    gdf_concat = pd.concat(
        [df_insee_3035,
         gpd.GeoDataFrame({'geometry': empty_population_box,
                           'pop_count': [0] * len(empty_population_box)},
                          crs="+init=epsg:3035")],
        ignore_index=True, sort=False)

    # Remove added grid-cells outside the convex hull of the population data frame
    df_insee_convex_hull_3035 = df_insee_3035.unary_union.convex_hull
    gdf_concat = gdf_concat[gdf_concat.apply(
        lambda x: df_insee_convex_hull_3035.intersects(x.geometry), axis=1)]
    gdf_concat.reset_index(drop=True, inplace=True)

    # Project (first project to latitude-longitude due to a GeoPandas issue)
    return ox.project_gdf(ox.project_gdf(gdf_concat, to_latlong=True))
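Calling sjoin once per candidate centroid performs one join per grid point. An alternative sketch, reusing xv, yv and df_insee_3035 from the function above, joins all centroids in a single call and keeps those that matched nothing:

import geopandas as gpd
from shapely.geometry import Point

# Reuses xv, yv and df_insee_3035 from the function above
centroids = gpd.GeoDataFrame(
    geometry=[Point(E, N) for E, N in zip(xv.ravel(), yv.ravel())],
    crs="+init=epsg:3035")
# One left join for all candidate points; unmatched centroids get NaN
matched = gpd.sjoin(centroids, df_insee_3035, how="left")
unmatched_idx = matched.loc[matched["index_right"].isnull()].index
empty_centroids = centroids.loc[unmatched_idx]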
Example #8
Source File: classification.py From urbansprawl with MIT License | 4 votes |
def compute_landuse_inference(df_buildings, df_landuse):
    """ Compute land use inference for building polygons with no information.
    The inference is done using polygons with a defined land use.
    A building polygon's land use is inferred by adopting the land use of the
    smallest encompassing polygon with a defined land use

    Parameters
    ----------
    df_buildings : geopandas.GeoDataFrame
        input buildings
    df_landuse : geopandas.GeoDataFrame
        land use polygons to aid the inference procedure

    Returns
    ----------
    """
    # Get those indices which need to be inferred, keeping the geometry column only
    df_buildings_to_infer = df_buildings.loc[
        df_buildings['classification'] == 'infer', ["geometry"]]
    # Add the land use polygon's area
    df_landuse['area'] = df_landuse.apply(lambda x: x.geometry.area, axis=1)

    # Get geometries to infer lying within land use polygons
    sjoin = gpd.sjoin(df_buildings_to_infer, df_landuse, op='within')
    # Add an index column to sort values
    sjoin['index'] = sjoin.index
    # Sort values by index, then by area
    sjoin.sort_values(by=['index', 'area'], inplace=True)
    # Drop duplicates, keeping the first (minimum area)
    sjoin.drop_duplicates(subset=['index'], keep='first', inplace=True)

    ##### Set key:value and classification
    # Set default value: inferred:None
    df_buildings.loc[df_buildings_to_infer.index, "key_value"] = df_buildings.loc[
        df_buildings_to_infer.index].apply(lambda x: {"inferred": None}, axis=1)
    # Set the land use for those buildings within a defined land use polygon
    df_buildings.loc[sjoin.index, "key_value"] = sjoin.apply(
        lambda x: {'inferred': x.landuse}, axis=1)
    # Set the classification
    df_buildings.loc[df_buildings_to_infer.index, "classification"] = df_buildings.loc[
        df_buildings_to_infer.index, "key_value"].apply(
            lambda x: classify_landuse_inference(x.get("inferred")))

    # Remove useless rows
    df_buildings.drop(df_buildings[df_buildings.classification.isin([None, "other"])].index,
                      inplace=True)
    df_buildings.reset_index(inplace=True, drop=True)

    assert len(df_buildings[df_buildings.classification.isnull()]) == 0

############################################
### Activity type classification
############################################
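The sort_values/drop_duplicates pair is a compact "smallest container per building" selection: after the within-join, a building may lie inside several land use polygons, and sorting by area before dropping duplicates keeps only the smallest one. An equivalent spelling using idxmin, assuming a fresh join before the helper 'index' column is added:

# Reuses df_buildings_to_infer and df_landuse from the function above
sj = gpd.sjoin(df_buildings_to_infer, df_landuse, op='within')
# Reset to a unique index so the idxmin() labels are unambiguous, then
# keep, per building, the matching polygon with the smallest area
smallest = (sj.reset_index()
              .loc[lambda d: d.groupby('index')['area'].idxmin()]
              .set_index('index'))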
Example #9
Source File: mesh.py From dhitools with MIT License | 4 votes |
def lyr_from_shape(self, lyr_name, input_shp, field_attribute, output_shp=None):
    """
    Create a model input layer at mesh element coordinates.

    For example, input_shp is a roughness map containing polygons with
    roughness values. A spatial join is performed for mesh element points
    within input_shp polygons and returns field_attribute at element points.

    Parameters
    ----------
    lyr_name : str
        Layer name as key to `lyrs` attribute dictionary
    input_shp : str
        Path to input shape file
    field_attribute : str
        Attribute in `input_shp` to extract at mesh elements
    output_shp : str, optional
        Output path to write to .shp file

    Returns
    -------
    Inserts `lyr_name` into the `lyrs` attribute dictionary as an ndarray,
    shape (num_elements,), with the extracted `field_attribute` value for
    each mesh element
    """
    # Load input_shp to GeoDF
    input_df = gpd.read_file(input_shp)

    # Load mesh element points as GeoDF
    mesh_df = self.to_gpd()

    # Perform spatial join
    join_df = gpd.sjoin(mesh_df, input_df, how='left', op='within')

    # Drop duplicated points; there is the potential to have duplicated
    # points when they intersect two different polygons. Keep the first
    join_df = join_df[~join_df.index.duplicated(keep='first')]

    self.lyrs[lyr_name] = np.array(join_df[field_attribute])

    if output_shp is not None:
        join_df.to_file(output_shp)
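The deduplication line deserves a look in isolation: a left sjoin yields one row per (point, polygon) match, so a mesh point on a shared polygon boundary appears twice, and the boolean mask keeps only its first match. A toy illustration:

import pandas as pd

# Index 1 matched two polygons and therefore appears twice
df = pd.DataFrame({'roughness': [0.03, 0.05, 0.02]}, index=[0, 1, 1])
deduped = df[~df.index.duplicated(keep='first')]
# deduped holds one row per mesh element: indices 0 and 1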
Example #10
Source File: generators.py From PowerGenome with MIT License | 4 votes |
def label_hydro_region(gens_860, pudl_engine, model_regions_gdf):
    """
    Label hydro facilities that don't have a region by default.

    Parameters
    ----------
    gens_860 : dataframe
        Information on all generators from PUDL
    pudl_engine : sqlalchemy.Engine
        A sqlalchemy connection for use by pandas
    model_regions_gdf : dataframe
        Geodataframe of the model regions

    Returns
    -------
    dataframe
        Plant id and region for any hydro that didn't originally have a
        region label.
    """
    plant_entity = pd.read_sql_table("plants_entity_eia", pudl_engine)
    model_hydro = gens_860.loc[
        gens_860["technology_description"] == "Conventional Hydroelectric"
    ].merge(plant_entity[["plant_id_eia", "latitude", "longitude"]], on="plant_id_eia")

    no_lat_lon = model_hydro.loc[
        (model_hydro["latitude"].isnull()) | (model_hydro["longitude"].isnull()), :
    ]
    if not no_lat_lon.empty:
        print(no_lat_lon["summer_capacity_mw"].sum(), " MW without lat/lon")

    model_hydro = model_hydro.dropna(subset=["latitude", "longitude"])

    # Convert the lon/lat values to geo points. Need to add an initial CRS and then
    # change it to align with the IPM regions
    model_hydro_gdf = gpd.GeoDataFrame(
        model_hydro,
        geometry=gpd.points_from_xy(model_hydro.longitude, model_hydro.latitude),
        crs={"init": "epsg:4326"},
    )
    if model_hydro_gdf.crs != model_regions_gdf.crs:
        model_hydro_gdf = model_hydro_gdf.to_crs(model_regions_gdf.crs)

    model_hydro_gdf = gpd.sjoin(model_regions_gdf, model_hydro_gdf)
    model_hydro_gdf = model_hydro_gdf.rename(columns={"IPM_Region": "region"})

    keep_cols = ["plant_id_eia", "region"]

    return model_hydro_gdf.loc[:, keep_cols]
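sjoin expects both frames in the same CRS (recent GeoPandas versions warn on a mismatch), which is why the example reprojects before joining. Note also that the left frame's geometry is the one retained in the result, so this join carries the region polygons rather than the plant points. A hypothetical guard helper in the same spirit:

import geopandas as gpd

def align_crs(gdf, target_gdf):
    """Hypothetical helper: reproject gdf onto target_gdf's CRS if they differ."""
    if gdf.crs != target_gdf.crs:
        gdf = gdf.to_crs(target_gdf.crs)
    return gdf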
Example #11
Source File: burst.py From OpenSarToolkit with MIT License | 4 votes |
def refine_burst_inventory(aoi, burst_gdf, outfile, coverages=None):
    '''Creates a Burst GeoDataFrame from an OST inventory file

    Args:

    Returns:

    '''
    # turn the AOI into a GeoDataFrame
    aoi_gdf = gpd.GeoDataFrame(vec.wkt_to_gdf(aoi).buffer(0.05))
    aoi_gdf.columns = ['geometry']

    # get columns of input dataframe for the later return statement
    cols = burst_gdf.columns

    # 1) keep only intersecting footprints (double, since we do this before)
    burst_gdf = gpd.sjoin(burst_gdf, aoi_gdf, how='inner', op='intersects')

    # if the AOI gdf has an id field, the join renames it to id_left
    if 'id_left' in burst_gdf.columns.tolist():
        # rename id_left back to id
        burst_gdf.columns = (['id' if x == 'id_left' else x
                              for x in burst_gdf.columns.tolist()])

    # remove duplicates
    burst_gdf.drop_duplicates(['SceneID', 'Date', 'bid'], inplace=True)

    # check if the number of bursts aligns with the number of coverages
    if coverages:
        for burst in burst_gdf.bid.unique():
            if len(burst_gdf[burst_gdf.bid == burst]) != coverages:
                print(' INFO. Removing burst {} because of'
                      ' insufficient coverage.'.format(burst))
                burst_gdf.drop(burst_gdf[burst_gdf.bid == burst].index,
                               inplace=True)

    # save the file
    burst_gdf['Date'] = burst_gdf['Date'].astype(str)
    burst_gdf['BurstNr'] = burst_gdf['BurstNr'].astype(str)
    burst_gdf['AnxTime'] = burst_gdf['AnxTime'].astype(str)
    burst_gdf['Track'] = burst_gdf['Track'].astype(str)
    burst_gdf.to_file(outfile)

    return burst_gdf[cols]
Example #12
Source File: geometries.py From deeposlandia with MIT License | 4 votes |
def extract_tile_items(
    raster_features, labels, min_x, min_y, tile_width, tile_height
):
    """Extract label items that belong to the tile defined by the minimum
    horizontal pixel `min_x` (left tile limit), the minimum vertical pixel
    `min_y` (upper tile limit) and the sizes `tile_width` and `tile_height`
    measured as a pixel amount.

    The tile is cropped from the original image raster as follows:
    - horizontally, between `min_x` and `min_x+tile_width`
    - vertically, between `min_y` and `min_y+tile_height`

    This method takes care of the original data projection (UTM 37S, Tanzania
    area); however, this parameter may be changed if similar data in another
    projection is used.

    Parameters
    ----------
    raster_features : dict
        Raw image raster geographical features (`north`, `south`, `east` and
        `west` coordinates, `width` and `height` measured in pixels)
    labels : geopandas.GeoDataFrame
        Raw image labels, as a set of geometries
    min_x : int
        Left tile limit, as a horizontal pixel index
    min_y : int
        Upper tile limit, as a vertical pixel index
    tile_width : int
        Tile width, measured in pixels
    tile_height : int
        Tile height, measured in pixels

    Returns
    -------
    geopandas.GeoDataFrame
        Set of ground-truth labels contained in the tile, characterized by
        their type (complete, unfinished or foundation) and their geometry
    """
    area = get_tile_footprint(
        raster_features, min_x, min_y, tile_width, tile_height
    )
    bdf = gpd.GeoDataFrame(
        crs=from_epsg(raster_features["srid"]), geometry=[area]
    )
    reproj_labels = labels.to_crs(epsg=raster_features["srid"])

    tile_items = gpd.sjoin(reproj_labels, bdf)
    if tile_items.shape[0] == 0:
        return tile_items[["condition", "geometry"]]

    tile_items = gpd.overlay(tile_items, bdf)
    tile_items = tile_items.explode()  # Manage MultiPolygons

    return tile_items[["condition", "geometry"]]
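Here sjoin acts as a fast intersection filter and gpd.overlay then performs the exact crop. Since GeoPandas 0.7 the same crop can be written with gpd.clip; a sketch reusing reproj_labels and area from the function above:

import geopandas as gpd

# gpd.clip accepts a shapely geometry as the mask
clipped = gpd.clip(reproj_labels, area)
# Like sjoin + overlay, geometries straddling the tile edge are cut back
clipped = clipped.explode()  # split any resulting MultiPolygons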