# Source code for landlab.io.shapefile.read_shapefile

#!/usr/bin/env python3
"""Functions to read shapefiles and create a NetworkModelGrid."""
import pathlib

import numpy as np
import pandas as pd
import shapefile as ps

from landlab.graph.graph import NetworkGraph
from landlab.grid.network import NetworkModelGrid


def _read_shapefile(file, dbf):
    """Open a shapefile with pyshp.

    Parameters
    ----------
    file : str, pathlib.Path, or file-like
        Path to the shapefile, or an open file-like object of the shp
        component.
    dbf : file-like or None
        Optional dbf component; only forwarded to the reader when given.

    Returns
    -------
    shapefile.Reader
        A pyshp reader for the shapefile.
    """
    reader_kwds = {} if dbf is None else {"dbf": dbf}

    # Paths are opened by name; anything else is treated as an open stream.
    if isinstance(file, (str, pathlib.Path)):
        return ps.Reader(str(file), **reader_kwds)
    return ps.Reader(shp=file, **reader_kwds)


_NUMPY_DTYPE = {
    "integer": (int, float),
    "mixed": (float, complex),
    "mixed-integer": (int, complex),
    "mixed-integer-float": (float,),
    "floating": (float,),
    "complex": (complex,),
    "boolean": (bool,),
}


def _infer_data_type(array, dtype=None):
    """Infer the type of a numpy array.

    Infer the data type of a numpy array based on its values. This function
    is most useful when used on arrays that have *dtype=object* but all of
    its elements are of a standard data type.

    Parameters
    ----------
    array : array_like
        Input data to infer its data type.
    dtype : data-type, optional
        If not provided, infer data type from array elements; otherwise,
        try to cast to the provided data type.

    Returns
    -------
    out : ndarray
        Array interpretation of the input array with either the provided
        data type or one inferred from its elements.

    Examples
    --------
    >>> import numpy as np

    >>> _infer_data_type([1, 2, 3])
    array([1, 2, 3])
    >>> _infer_data_type([1, 2, 3], dtype=float)
    array([ 1.,  2.,  3.])
    >>> _infer_data_type(np.array([1.0, 2.0, 3.0], dtype=object))
    array([ 1.,  2.,  3.])
    >>> _infer_data_type([[1, 2, 3]])
    array([[1, 2, 3]])
    >>> _infer_data_type([None, 1, 2, 3])
    array([ nan,   1.,   2.,   3.])
    """
    array = np.asarray(array, dtype=dtype)
    if dtype is None and not np.issubdtype(array.dtype, np.number):
        infered_dtype = pd.api.types.infer_dtype(array, skipna=True)
        for dtype in _NUMPY_DTYPE.get(infered_dtype, ()):
            try:
                _array = np.asarray(array.flatten(), dtype=dtype)
            except (TypeError, ValueError):
                pass
            else:
                array = _array.reshape(array.shape)
                break
    return array


def read_shapefile(
    file,
    dbf=None,
    store_polyline_vertices=True,
    points_shapefile=None,
    points_dbf=None,
    link_fields=None,
    node_fields=None,
    link_field_conversion=None,
    node_field_conversion=None,
    link_field_dtype=None,
    node_field_dtype=None,
    threshold=0.0,
):
    """Read shapefile and create a NetworkModelGrid.

    There are a number of assumptions that are required about the shapefile.

    * The shapefile must be a polyline shapefile.
    * All polylines must be their own object (e.g. no multi-part polylines).
    * Polyline endpoints match perfectly.

    You might notice that there is no ``write_shapefile`` function. If this
    is something you need for your work, please make a GitHub issue to
    start this process.

    Parameters
    ----------
    file : str or file-like
        File path or file-like of a valid polyline shapefile.
    dbf : file-like, optional
        If file is file-like, the dbf must also be passed.
    store_polyline_vertices : bool, optional
        If True (default), store the vertices of the polylines in the
        at_link fields ``x_of_polyline`` and ``y_of_polyline``.
    points_shapefile : str or file-like
        File path or file-like of a valid point shapefile.
    points_dbf : file-like, optional
        If file is file-like, the dbf must also be passed.
    link_fields : list, optional
        List of polyline shapefile attributes to import as landlab at-link
        fields. Default is to import all.
    node_fields : list, optional
        List of point shapefile attributes to import as landlab at-node
        fields. Default is to import all.
    link_field_conversion : dict, optional
        Dictionary mapping polyline shapefile field names to desired
        at-link field names. Default is no remapping.
    node_field_conversion : dict, optional
        Dictionary mapping node shapefile field names to desired at-node
        field names. Default is no remapping.
    link_field_dtype : dict, optional
        Dictionary mapping node shapefile field names to desired dtype.
        Default is no change to dtype.
    node_field_dtype : dict, optional
        Dictionary mapping node shapefile field names to desired dtype.
        Default is no change to dtype.
    threshold : float, optional
        Maximum distance between a point in the point shapefile and a
        polyline junction in the polyline shapefile. Units are the same as
        in the shapefiles. Default is zero (requiring perfect overlap).

    Returns
    -------
    grid : NetworkModelGrid instance
        The network model grid will have nodes at the endpoints of the
        polylines, and links that connect these nodes. Any fields
        associated with the shapefile will be added as at-link fields. If
        a point shapefile is provided those values will be added as
        at-node fields.

    Examples
    --------
    First, we make a simple shapefile

    >>> from io import BytesIO
    >>> import os
    >>> import shapefile
    >>> shp = BytesIO()
    >>> shx = BytesIO()
    >>> dbf = BytesIO()
    >>> w = shapefile.Writer(shp=shp, shx=shx, dbf=dbf)
    >>> w.shapeType = shapefile.POLYLINE
    >>> w.field("spam", "N")
    >>> w.line([[[5, 5], [10, 10]]])
    >>> w.record(37)
    >>> w.line([[[5, 0], [5, 5]]])
    >>> w.record(100)
    >>> w.line([[[5, 5], [0, 10]]])
    >>> w.record(239)
    >>> w.close()

    Now create a NetworkModelGrid with read_shapefile:

    >>> from landlab.io import read_shapefile
    >>> grid = read_shapefile(shp, dbf=dbf)
    >>> grid.nodes
    array([0, 1, 2, 3])
    >>> grid.x_of_node
    array([ 5.,  5.,  0., 10.])
    >>> grid.y_of_node
    array([ 0.,  5., 10., 10.])
    >>> grid.nodes_at_link
    array([[0, 1],
           [2, 1],
           [1, 3]])
    >>> assert "spam" in grid.at_link
    >>> grid.at_link["spam"]
    array([100, 239,  37])

    Next lets also include a points file. First create both shapefiles.

    >>> shp = BytesIO()
    >>> shx = BytesIO()
    >>> dbf = BytesIO()
    >>> w = shapefile.Writer(shp=shp, shx=shx, dbf=dbf)
    >>> w.shapeType = shapefile.POLYLINE
    >>> w.field("spam", "N")
    >>> w.line([[[5, 5], [10, 10]]])
    >>> w.record(37)
    >>> w.line([[[5, 0], [5, 5]]])
    >>> w.record(100)
    >>> w.line([[[5, 5], [0, 10]]])
    >>> w.record(239)
    >>> w.close()
    >>> p_shp = BytesIO()
    >>> p_shx = BytesIO()
    >>> p_dbf = BytesIO()
    >>> p_w = shapefile.Writer(shp=p_shp, shx=p_shx, dbf=p_dbf)
    >>> p_w.shapeType = shapefile.POINT
    >>> p_w.field("eggs", "N")
    >>> p_w.point(5, 0)
    >>> p_w.record(2)
    >>> p_w.point(5, 5)
    >>> p_w.record(4)
    >>> p_w.point(0, 10)
    >>> p_w.record(8)
    >>> p_w.point(10, 10)
    >>> p_w.record(6)
    >>> p_w.close()

    Now read in both files together.

    >>> grid = read_shapefile(
    ...     shp, dbf=dbf, points_shapefile=p_shp, points_dbf=p_dbf
    ... )
    >>> grid.nodes
    array([0, 1, 2, 3])
    >>> grid.x_of_node
    array([ 5.,  5.,  0., 10.])
    >>> grid.y_of_node
    array([ 0.,  5., 10., 10.])
    >>> grid.nodes_at_link
    array([[0, 1],
           [2, 1],
           [1, 3]])
    >>> assert "spam" in grid.at_link
    >>> grid.at_link["spam"]
    array([100, 239,  37])
    >>> assert "eggs" in grid.at_node
    >>> grid.at_node["eggs"]
    array([2, 4, 8, 6])
    """
    if node_fields is not None:
        node_fields = set(node_fields)
    if link_fields is not None:
        link_fields = set(link_fields)
    if not points_shapefile and node_fields:
        raise ValueError("node_fields is provided without a points shapefile")

    sf = _read_shapefile(file, dbf)

    link_field_conversion = link_field_conversion or {}
    node_field_conversion = node_field_conversion or {}
    link_field_dtype = link_field_dtype or {}
    node_field_dtype = node_field_dtype or {}

    if sf.shapeTypeName != "POLYLINE":
        raise ValueError(
            "landlab.io.shapefile read requires a polyline "
            "type shapefile. The provided shapefile does "
            "not meet these requirements."
        )

    if points_shapefile:
        psf = _read_shapefile(points_shapefile, points_dbf)
        if psf.shapeTypeName != "POINT":
            raise ValueError(
                "landlab.io.shapefile read requires a point "
                "type shapefile. The provided shapefile does "
                "not meet these requirements."
            )

    # Get record information; the first element is
    # ('DeletionFlag', 'C', 1, 0), which we ignore.
    records = sf.fields[1:]

    # Initialize data structures for node (x, y) tuples, link
    # (head_node_id, tail_node_id) tuples, and a dictionary of at-link
    # fields. Besides the at-link fields on the shapefile, we also store
    # arrays of x and y of the full polyline segment.
    node_xy = []
    links = []
    fields = {rec[0]: [] for rec in records}

    # Store which link fields to retain.
    if link_fields is None:
        link_fields = set(fields.keys())
    if store_polyline_vertices:
        link_fields.update(["x_of_polyline", "y_of_polyline"])
        fields["x_of_polyline"] = []
        fields["y_of_polyline"] = []
    if not link_fields.issubset(fields):
        raise ValueError(
            "requested link fields to retain are not contained in the shapefile."
        )
    record_order = [rec[0] for rec in records]

    # Iterate through shapes and records.
    for sr in sf.shapeRecords():
        if len(sr.shape.parts) != 1:
            # Multi-part polylines are not supported.
            raise ValueError(
                "landlab.io.shapefile currently does not support "
                "reading multipart polyline shapefiles."
            )

        # Get all the points on the polyline and deconstruct into x and y.
        x, y = zip(*sr.shape.points)

        # Construct the (x, y) tuples of the head and tail nodes of each
        # polyline. Note that head and tail just refer to starting and
        # ending; they will be re-oriented if necessary by landlab.
        head_node_xy = (x[0], y[0])
        tail_node_xy = (x[-1], y[-1])

        # We should expect that the head node and tail node of later links
        # are already part of the model grid, so add them only if missing.
        if head_node_xy not in node_xy:
            node_xy.append(head_node_xy)
        if tail_node_xy not in node_xy:
            node_xy.append(tail_node_xy)

        # Append the head and tail node ids to the link array.
        links.append((node_xy.index(head_node_xy), node_xy.index(tail_node_xy)))

        for i, field_name in enumerate(record_order):
            fields[field_name].append(sr.record[i])
        if store_polyline_vertices:
            fields["x_of_polyline"].append(x)
            fields["y_of_polyline"].append(y)

    # Create a Network Model Grid. We want to maintain sorting, so start
    # by creating an unsorted network graph and sorting. The sorting is
    # important to ensure that the fields are assigned to the correct links.
    x_of_node, y_of_node = zip(*node_xy)
    graph = NetworkGraph((y_of_node, x_of_node), links=links, sort=False)
    sorted_nodes, sorted_links, sorted_patches = graph.sort()

    # Use the sorting information to make a new network model grid.
    grid = NetworkModelGrid(
        (np.asarray(y_of_node)[sorted_nodes], np.asarray(x_of_node)[sorted_nodes]),
        np.vstack((graph.node_at_link_head, graph.node_at_link_tail)).T,
    )

    # Add values to link fields, remapping names and dtypes as requested.
    for field_name in link_fields:
        mapped_field_name = link_field_conversion.get(field_name, field_name)
        values = _convert_array(
            fields[field_name], dtype=link_field_dtype.get(field_name, None)
        )
        grid.at_link[mapped_field_name] = np.take(values, sorted_links)

    # If a points shapefile was given, bring it in and use it.
    if points_shapefile:
        psf_records = psf.fields[1:]
        psf_record_order = [rec[0] for rec in psf_records]
        psf_fields = {rec[0]: [] for rec in psf_records}

        # Store which node fields to retain.
        if node_fields is None:
            node_fields = set(psf_fields)
        if not node_fields.issubset(psf_fields):
            raise ValueError(
                "requested node fields to retain are not contained in the shapefile."
            )

        # We don't need to store node xy, just which grid node each point
        # maps to; -1 marks a grid node with no matching point yet.
        psf_node_mapping = np.full(grid.x_of_node.shape, -1, dtype=int)

        # Loop through each point, matching it to the nearest grid node.
        for node_idx, sr in enumerate(psf.shapeRecords()):
            point_x = sr.shape.points[0][0]
            point_y = sr.shape.points[0][1]
            x_diff = grid.x_of_node - point_x
            y_diff = grid.y_of_node - point_y
            dist = np.sqrt(x_diff**2 + y_diff**2)

            # Check that the distance to the nearest junction is small.
            min_dist = np.min(dist)
            if min_dist > threshold:
                raise ValueError(
                    "landlab.io.shapefile: a point in the points shapefile "
                    f"is {min_dist} away from the closest polyline junction "
                    "in the polyline shapefile. This is larger than the "
                    f"threshold value of {threshold}. This may mean that "
                    "the threshold should be increased or that something "
                    "is wrong with the points file."
                )

            ind = np.nonzero(dist == min_dist)[0]

            # Verify that no other point already claimed this grid node.
            if psf_node_mapping[ind[0]] >= 0:
                raise ValueError(
                    "landlab.io.shapefile requires that the points file "
                    "have a 1-1 mapping to the polylines file. More than one "
                    "at-node point provided maps to the node with Landlab ID "
                    f"{ind[0]}, (x,y). This point has coordinates of ({point_x}, {point_y})"
                )
            psf_node_mapping[ind[0]] = node_idx

            for rec_idx, field_name in enumerate(psf_record_order):
                psf_fields[field_name].append(sr.record[rec_idx])

        if np.any(psf_node_mapping < 0):
            raise ValueError(
                "landlab.io.shapefile requires that the points file "
                "contain the same number of points as polyline junctions. "
                "The points file contains fewer points than polyline junctions."
            )

        # Add values to node fields, reordered to match grid node ids.
        for field_name in node_fields:
            mapped_field_name = node_field_conversion.get(field_name, field_name)
            grid.at_node[mapped_field_name] = _infer_data_type(
                np.take(psf_fields[field_name], psf_node_mapping),
                dtype=node_field_dtype.get(field_name, None),
            )

    return grid
def _convert_array(values, dtype=None): try: array = np.asarray(values, dtype=dtype) except ValueError: is_jagged_array = True else: is_jagged_array = False if is_jagged_array: array = np.array( [_infer_data_type(value, dtype=dtype) for value in values], dtype=object, ) return array