geoarrow-pyarrow#

Contains pyarrow integration for the geoarrow Python bindings.

Examples#

>>> import geoarrow.pyarrow as ga

Array constructors#

array(obj, type_=None, *args, **kwargs) → GeometryExtensionArray#

Attempt to create an Array or ChunkedArray with a geoarrow extension type from obj. This constructor attempts to perform the fewest transformations possible (i.e., WKB is left as WKB, WKT is left as WKT), whereas geoarrow.pyarrow.as_geoarrow() actively attempts a conversion to a geoarrow-encoding based on a common geometry type. GeoPandas objects are supported. This implementation relies heavily on pyarrow.array() and has similar behaviour.

>>> import geoarrow.pyarrow as ga
>>> ga.array(["POINT (0 1)"])
GeometryExtensionArray:WktType(geoarrow.wkt)[1]
<POINT (0 1)>
>>> ga.as_geoarrow(["POINT (0 1)"])
GeometryExtensionArray:PointType(geoarrow.point)[1]
<POINT (0 1)>

Type Constructors#

wkb() → WkbType#

Well-known binary with a maximum array size of 2 GB per chunk.

>>> import geoarrow.pyarrow as ga
>>> ga.wkb()
WkbType(geoarrow.wkb)
>>> ga.wkb().storage_type
DataType(binary)

wkt() → WktType#

Well-known text with a maximum array size of 2 GB per chunk.

>>> import geoarrow.pyarrow as ga
>>> ga.wkt()
WktType(geoarrow.wkt)
>>> ga.wkt().storage_type
DataType(string)

large_wkb() → WkbType#

Well-known binary using 64-bit integer offsets.

>>> import geoarrow.pyarrow as ga
>>> ga.large_wkb()
WkbType(geoarrow.wkb)
>>> ga.large_wkb().storage_type
DataType(large_binary)

large_wkt() → WktType#

Well-known text using 64-bit integer offsets.

>>> import geoarrow.pyarrow as ga
>>> ga.large_wkt()
WktType(geoarrow.wkt)
>>> ga.large_wkt().storage_type
DataType(large_string)

point() → PointType#

Geoarrow-encoded point features.

>>> import geoarrow.pyarrow as ga
>>> ga.point()
PointType(geoarrow.point)
>>> ga.point().storage_type
StructType(struct<x: double not null, y: double not null>)

linestring() → LinestringType#

Geoarrow-encoded line features.

>>> import geoarrow.pyarrow as ga
>>> ga.linestring()
LinestringType(geoarrow.linestring)
>>> ga.linestring().storage_type
ListType(list<vertices: struct<x: double not null, y: double not null> not null>)

polygon() → PolygonType#

Geoarrow-encoded polygon features.

>>> import geoarrow.pyarrow as ga
>>> ga.polygon()
PolygonType(geoarrow.polygon)
>>> ga.polygon().storage_type
ListType(list<vertices: list<rings: struct<x: double not null, y: double not null> not null> not null>)

multipoint() → MultiPointType#

Geoarrow-encoded multipoint features.

>>> import geoarrow.pyarrow as ga
>>> ga.multipoint()
MultiPointType(geoarrow.multipoint)
>>> ga.multipoint().storage_type
ListType(list<points: struct<x: double not null, y: double not null> not null>)

multilinestring() → MultiLinestringType#

Geoarrow-encoded multilinestring features.

>>> import geoarrow.pyarrow as ga
>>> ga.multilinestring()
MultiLinestringType(geoarrow.multilinestring)
>>> ga.multilinestring().storage_type
ListType(list<vertices: list<linestrings: struct<x: double not null, y: double not null> not null> not null>)

multipolygon() → MultiPolygonType#

Geoarrow-encoded multipolygon features.

>>> import geoarrow.pyarrow as ga
>>> ga.multipolygon()
MultiPolygonType(geoarrow.multipolygon)
>>> ga.multipolygon().storage_type
ListType(list<vertices: list<rings: list<polygons: struct<x: double not null, y: double not null> not null> not null> not null>)

Compute functions#

parse_all(obj)#

Parse all features and return nothing. This is useful for geoarrow.pyarrow.wkb() and geoarrow.pyarrow.wkt()-encoded arrays to validate their contents. For other types, this is a no-op.

>>> import geoarrow.pyarrow as ga
>>> ga.parse_all(["POINT (0 1)"])
>>> ga.parse_all(["POINT (0 1"])
Traceback (most recent call last):
 ...
geoarrow.c._lib.GeoArrowCException: GeoArrowKernel<visit_void_agg>::push_batch() failed (22): Expected ')' at byte 10

unique_geometry_types(obj)#

Compute unique geometry types from obj as a struct with columns geometry_type and dimensions. The values of these columns correspond to the values of the geoarrow.GeometryType and geoarrow.Dimensions enumerators.

>>> import geoarrow.pyarrow as ga
>>> print(str(ga.unique_geometry_types(["POINT Z (0 1 2)", "LINESTRING (0 0, 1 3)"])))
-- is_valid: all not null
-- child 0 type: int32
  [
    2,
    1
  ]
-- child 1 type: int32
  [
    1,
    2
  ]

infer_type_common(obj, coord_type=None, promote_multi=False, _geometry_types=None)#

Infer a common geoarrow.pyarrow.GeometryExtensionType for the geometries in obj, preferring geoarrow-encoded types and falling back to well-known binary.

>>> import geoarrow.pyarrow as ga
>>> ga.infer_type_common(["POINT Z (0 1 2)", "LINESTRING (0 0, 1 3)"])
WkbType(geoarrow.wkb)
>>> ga.infer_type_common(["POINT Z (0 1 2)", "MULTIPOINT (0 0, 1 3)"])
MultiPointType(geoarrow.multipoint_z)

as_wkt(obj)#

Encode obj as geoarrow.pyarrow.wkt().

>>> import geoarrow.pyarrow as ga
>>> points = ga.as_geoarrow(["POINT (0 1)"])
>>> ga.as_wkt(points)
GeometryExtensionArray:WktType(geoarrow.wkt)[1]
<POINT (0 1)>

as_wkb(obj, strict_iso_wkb=False)#

Encode obj as geoarrow.pyarrow.wkb().

>>> import geoarrow.pyarrow as ga
>>> points = ga.as_geoarrow(["POINT (0 1)"])
>>> ga.as_wkb(points)
GeometryExtensionArray:WkbType(geoarrow.wkb)[1]
<POINT (0 1)>

as_geoarrow(obj, type=None, coord_type=None, promote_multi=False)#

Encode obj as a geoarrow-encoded array, preferring geoarrow encodings and falling back to well-known binary if no common geometry type is found.

>>> import geoarrow.pyarrow as ga
>>> ga.as_geoarrow(["POINT (0 1)", "MULTIPOINT Z (0 1 2, 4 5 6)"])
GeometryExtensionArray:MultiPointType(geoarrow.multipoint_z)[2]
<MULTIPOINT Z (0 1 nan)>
<MULTIPOINT Z (0 1 2, 4 5 6)>

format_wkt(obj, precision=None, max_element_size_bytes=None)#

Format geometries in an object as well-known text with an optional cap on digits and element size to prevent excessive output for large features.

>>> import geoarrow.pyarrow as ga
>>> print(str(ga.format_wkt(ga.array(["POINT (0 1.3333333333333)"]), precision=5)))
[
  "POINT (0 1.33333)"
]
>>> print(str(ga.format_wkt(ga.array(["POINT (0 1)"]), max_element_size_bytes=3)))
[
  "POI"
]

box(obj)#

Compute a Cartesian 2D bounding box for each feature in obj as a struct(xmin, ymin, xmax, ymax) array.

>>> import geoarrow.pyarrow as ga
>>> ga.box(["LINESTRING (0 10, 34 -1)"]).type
BoxType(geoarrow.box)
>>> print(str(ga.box(["LINESTRING (0 10, 34 -1)"])))
-- is_valid: all not null
-- child 0 type: double
  [
    0
  ]
-- child 1 type: double
  [
    -1
  ]
-- child 2 type: double
  [
    34
  ]
-- child 3 type: double
  [
    10
  ]

box_agg(obj)#

Compute a Cartesian 2D bounding box for all features in obj as a scalar struct(xmin, ymin, xmax, ymax). Values that are null are currently ignored.

>>> import geoarrow.pyarrow as ga
>>> ga.box_agg(["POINT (0 10)", "POINT (34 -1)"])
BoxScalar({'xmin': 0.0, 'ymin': -1.0, 'xmax': 34.0, 'ymax': 10.0})

rechunk(obj, max_bytes)#

Split up chunks of obj into zero-copy slices with a maximum size of max_bytes. This may be useful to more predictably parallelize a computation for variable feature sizes.

>>> import geoarrow.pyarrow as ga
>>> print(str(ga.rechunk(["POINT (0 1)", "POINT (2 3)"], max_bytes=100)))
[
  [
    "POINT (0 1)",
    "POINT (2 3)"
  ]
]
>>> print(str(ga.rechunk(["POINT (0 1)", "POINT (2 3)"], max_bytes=5)))
[
  [
    "POINT (0 1)"
  ],
  [
    "POINT (2 3)"
  ]
]

with_coord_type(obj, coord_type)#

Attempt to convert obj to a geoarrow-encoded array with a specific CoordType.

>>> import geoarrow.pyarrow as ga
>>> ga.with_coord_type(["POINT (0 1)"], ga.CoordType.INTERLEAVED)
GeometryExtensionArray:PointType(interleaved geoarrow.point)[1]
<POINT (0 1)>

with_edge_type(obj, edge_type)#

Force a geoarrow.EdgeType on an array.

>>> import geoarrow.pyarrow as ga
>>> ga.with_edge_type(["LINESTRING (0 1, 2 3)"], ga.EdgeType.SPHERICAL)
GeometryExtensionArray:WktType(spherical geoarrow.wkt)[1]
<LINESTRING (0 1, 2 3)>

with_crs(obj, crs)#

Force a geoarrow.CrsType/crs value on an array.

>>> import geoarrow.pyarrow as ga
>>> ga.with_crs(["POINT (0 1)"], ga.OGC_CRS84)
GeometryExtensionArray:WktType(geoarrow.wkt <ProjJsonCrs(OGC:CRS84)>)[1]
<POINT (0 1)>

with_dimensions(obj, dimensions)#

Attempt to convert obj to a geoarrow-encoded array with a specific geoarrow.Dimensions. If dimensions need to be added, nonexistent values will be filled with nan. If dimensions need to be dropped, this function will silently drop them. You can use geoarrow.pyarrow.unique_geometry_types() to efficiently detect if one or both of these will occur.

>>> import geoarrow.pyarrow as ga
>>> ga.with_dimensions(["POINT (0 1)"], ga.Dimensions.XYZM)
GeometryExtensionArray:PointType(geoarrow.point_zm)[1]
<POINT ZM (0 1 nan nan)>
>>> ga.with_dimensions(["POINT ZM (0 1 2 3)"], ga.Dimensions.XY)
GeometryExtensionArray:PointType(geoarrow.point)[1]
<POINT (0 1)>

with_geometry_type(obj, geometry_type)#

Attempt to convert obj to a geoarrow-encoded array with a specific geoarrow.GeometryType.

>>> import geoarrow.pyarrow as ga
>>> ga.with_geometry_type(["POINT (0 1)"], ga.GeometryType.MULTIPOINT)
GeometryExtensionArray:MultiPointType(geoarrow.multipoint)[1]
<MULTIPOINT (0 1)>
>>> ga.with_geometry_type(["MULTIPOINT (0 1)"], ga.GeometryType.POINT)
GeometryExtensionArray:PointType(geoarrow.point)[1]
<POINT (0 1)>
>>> ga.with_geometry_type(["LINESTRING EMPTY", "POINT (0 1)"], ga.GeometryType.POINT)
GeometryExtensionArray:PointType(geoarrow.point)[2]
<POINT (nan nan)>
<POINT (0 1)>
>>> ga.with_geometry_type(["MULTIPOINT (0 1, 2 3)"], ga.GeometryType.POINT)
Traceback (most recent call last):
  ...
geoarrow.c._lib.GeoArrowCException: GeoArrowKernel<as_geoarrow>::push_batch() failed (22): Can't convert feature with >1 coordinate to POINT

point_coords(obj, dimensions=None)#

Extract point coordinates into separate arrays or chunked arrays.

>>> import geoarrow.pyarrow as ga
>>> x, y = ga.point_coords(["POINT (0 1)", "POINT (2 3)"])
>>> list(x)
[<pyarrow.DoubleScalar: 0.0>, <pyarrow.DoubleScalar: 2.0>]
>>> list(y)
[<pyarrow.DoubleScalar: 1.0>, <pyarrow.DoubleScalar: 3.0>]

to_geopandas(obj)#

Convert a geoarrow-like array or table into a GeoSeries/DataFrame

These are thin wrappers around GeoSeries.from_arrow() and GeoDataFrame.from_arrow() where available, falling back on conversion through WKB if using an older version of GeoPandas or an Arrow array type that GeoPandas doesn’t support.

>>> import pyarrow as pa
>>> import geoarrow.pyarrow as ga
>>> array = ga.as_geoarrow(["POINT (0 1)"])
>>> ga.to_geopandas(array)
0    POINT (0 1)
dtype: geometry
>>> table = pa.table({"geometry": array})
>>> ga.to_geopandas(table)
      geometry
0  POINT (0 1)

Class Reference#

class GeometryExtensionType(spec: TypeSpec, *, storage_type=None, validate_storage_type=True)#

Extension type base class for vector geometry types.

property coord_type: CoordType#

The CoordType of this type.

>>> import geoarrow.pyarrow as ga
>>> ga.linestring().coord_type == ga.CoordType.SEPARATED
True
>>> ga.linestring().with_coord_type(ga.CoordType.INTERLEAVED).coord_type
<CoordType.INTERLEAVED: 2>

property crs: Crs | None#

The coordinate reference system of this type.

>>> import geoarrow.pyarrow as ga
>>> ga.point().with_crs(ga.OGC_CRS84).crs
ProjJsonCrs(OGC:CRS84)

property dimensions: Dimensions#

The Dimensions of this type or UNKNOWN for types where this is not constant (i.e., WKT and WKB).

>>> import geoarrow.pyarrow as ga
>>> ga.wkb().dimensions == ga.Dimensions.UNKNOWN
True
>>> ga.linestring().dimensions == ga.Dimensions.XY
True

property edge_type: EdgeType#

The EdgeType of this type.

>>> import geoarrow.pyarrow as ga
>>> ga.linestring().edge_type == ga.EdgeType.PLANAR
True
>>> ga.linestring().with_edge_type(ga.EdgeType.SPHERICAL).edge_type
<EdgeType.SPHERICAL: 2>

from_geobuffers(*args, **kwargs)#: Create an array from the appropriate number of buffers for this type.

property geometry_type: GeometryType#

The GeometryType of this type or GEOMETRY for types where this is not constant (i.e., WKT and WKB).

>>> import geoarrow.pyarrow as ga
>>> ga.wkb().geometry_type == ga.GeometryType.GEOMETRY
True
>>> ga.linestring().geometry_type == ga.GeometryType.LINESTRING
True

to_pandas_dtype(self)#

Return the equivalent NumPy / Pandas dtype.

Examples#

>>> import pyarrow as pa
>>> pa.int64().to_pandas_dtype()
<class 'numpy.int64'>

with_coord_type(coord_type)#

Returns a new type with the specified geoarrow.CoordType.

>>> import geoarrow.pyarrow as ga
>>> ga.point().with_coord_type(ga.CoordType.INTERLEAVED)
PointType(interleaved geoarrow.point)

with_crs(crs)#

Returns a new type with the specified coordinate reference system geoarrow.CrsType combination.

>>> import geoarrow.pyarrow as ga
>>> ga.linestring().with_crs(ga.OGC_CRS84)
LinestringType(geoarrow.linestring <ProjJsonCrs(OGC:CRS84)>)

with_dimensions(dimensions)#

Returns a new type with the specified geoarrow.Dimensions.

>>> import geoarrow.pyarrow as ga
>>> ga.point().with_dimensions(ga.Dimensions.XYZ)
PointType(geoarrow.point_z)

with_edge_type(edge_type)#

Returns a new type with the specified geoarrow.EdgeType.

>>> import geoarrow.pyarrow as ga
>>> ga.linestring().with_edge_type(ga.EdgeType.SPHERICAL)
LinestringType(spherical geoarrow.linestring)

with_geometry_type(geometry_type)#

Returns a new type with the specified geoarrow.GeometryType.

>>> import geoarrow.pyarrow as ga
>>> ga.point().with_geometry_type(ga.GeometryType.LINESTRING)
LinestringType(geoarrow.linestring)

with_metadata(metadata)#

This type with the extension metadata (e.g., copied from some other type).

>>> import geoarrow.pyarrow as ga
>>> ga.linestring().with_metadata('{"edges": "spherical"}').edge_type
<EdgeType.SPHERICAL: 2>

wrap_array(self, storage)#

Wrap the given storage array as an extension array.

Parameters#

storage : Array or ChunkedArray

Returns#

array : Array or ChunkedArray
    Extension array wrapping the storage array

class WkbType(spec: TypeSpec, *, storage_type=None, validate_storage_type=True)#: Extension type whose storage is a binary or large binary array of well-known binary. Even though the draft specification currently mandates ISO well-known binary, EWKB is supported by the parser.

class WktType(spec: TypeSpec, *, storage_type=None, validate_storage_type=True)#: Extension type whose storage is a utf8 or large utf8 array of well-known text.

class PointType(spec: TypeSpec, *, storage_type=None, validate_storage_type=True)#

Extension type whose storage is an array of points stored as either a struct with one child per dimension or a fixed-size list whose single child is composed of interleaved ordinate values.

from_geobuffers(validity, x, y=None, z_or_m=None, m=None)#: Create an array from the appropriate number of buffers for this type.

class LinestringType(spec: TypeSpec, *, storage_type=None, validate_storage_type=True)#

Extension type whose storage is an array of linestrings stored as a list of points as described in PointType.

from_geobuffers(validity, coord_offsets, x, y=None, z_or_m=None, m=None)#: Create an array from the appropriate number of buffers for this type.

class PolygonType(spec: TypeSpec, *, storage_type=None, validate_storage_type=True)#

Extension type whose storage is an array of polygons stored as a list of a list of points as described in PointType.

from_geobuffers(validity, ring_offsets, coord_offsets, x, y=None, z_or_m=None, m=None)#: Create an array from the appropriate number of buffers for this type.

class MultiPointType(spec: TypeSpec, *, storage_type=None, validate_storage_type=True)#

Extension type whose storage is an array of multipoints stored as a list of points as described in PointType.

from_geobuffers(validity, coord_offsets, x, y=None, z_or_m=None, m=None)#: Create an array from the appropriate number of buffers for this type.

class MultiLinestringType(spec: TypeSpec, *, storage_type=None, validate_storage_type=True)#

Extension type whose storage is an array of multilinestrings stored as a list of a list of points as described in PointType.

from_geobuffers(validity, linestring_offsets, coord_offsets, x, y=None, z_or_m=None, m=None)#: Create an array from the appropriate number of buffers for this type.

class MultiPolygonType(spec: TypeSpec, *, storage_type=None, validate_storage_type=True)#

Extension type whose storage is an array of multipolygons stored as a list of a list of a list of points as described in PointType.

from_geobuffers(validity, polygon_offsets, ring_offsets, coord_offsets, x, y=None, z_or_m=None, m=None)#: Create an array from the appropriate number of buffers for this type.

IO helpers#

GeoArrow IO helpers

A module wrapping IO functionality from external libraries to simplify testing and documenting the GeoArrow format and encodings.

>>> from geoarrow.pyarrow import io

read_pyogrio_table(*args, **kwargs)#

Read a file using GDAL/OGR

Reads a file as a pyarrow.Table using pyogrio.raw.read_arrow(). This does not parse the input, which OGR returns as geoarrow.pyarrow.wkb().

>>> from geoarrow.pyarrow import io
>>> import tempfile
>>> import geopandas
>>> import os
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     temp_gpkg = os.path.join(tmpdir, "test.gpkg")
...     geopandas.GeoDataFrame(
...         geometry=geopandas.GeoSeries.from_wkt(["POINT (0 1)"],
...         crs="OGC:CRS84")
...     ).to_file(temp_gpkg)
...     table = io.read_pyogrio_table(temp_gpkg)
...     table.column("geom").chunk(0)
GeometryExtensionArray:WkbType(geoarrow.wkb <...>)[1]
<POINT (0 1)>

read_geoparquet_table(*args, **kwargs)#

Read GeoParquet using PyArrow

A thin wrapper around pyarrow.parquet.read_table() that ensures any columns marked as geometry are encoded as extension arrays. This will read Parquet files with and without the geo metadata key as described in the GeoParquet format and guidance on compatible Parquet. Briefly, this means that any valid GeoParquet file (i.e., written by GDAL or write_geoparquet_table()) or regular Parquet file with a column named geometry can be read by read_geoparquet_table(). For regular Parquet files, a geometry column encoded as binary is assumed to contain well-known binary and a geometry column encoded as text is assumed to contain well-known text.

Because a pyarrow.Table has no concept of “primary geometry column”, that information is currently lost on read. To minimize the chances of misinterpreting the primary geometry column, use geometry as the name of the column or refer to columns explicitly.

>>> import geoarrow.pyarrow as ga
>>> from geoarrow.pyarrow import io
>>> import tempfile
>>> import os
>>> import pyarrow as pa
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     temp_pq = os.path.join(tmpdir, "test.parquet")
...     tab = pa.table([ga.array(["POINT (0 1)"])], names=["geometry"])
...     io.write_geoparquet_table(tab, temp_pq)
...     tab2 = io.read_geoparquet_table(temp_pq)
...     tab2["geometry"].chunk(0)
GeometryExtensionArray:WkbType(geoarrow.wkb)[1]
<POINT (0 1)>

write_geoparquet_table(table, *args, primary_geometry_column=None, geometry_columns=None, write_bbox=False, write_geometry_types=None, check_wkb=True, geometry_encoding='WKB', **kwargs)#

Write GeoParquet using PyArrow

Writes a Parquet file with the geo metadata key used by GeoParquet readers to recreate geometry types. Note that if you are writing Parquet that will be read by an Arrow-based Parquet reader and a GeoArrow implementation, using pyarrow.parquet.write_table() will preserve geometry types/metadata and is usually faster.

Note that passing write_bbox=True and/or write_geometry_types=True may be computationally expensive for large input. Use write_geometry_types=False to force omitting geometry types even when this value is type-constant.

See read_geoparquet_table() for examples.

Dataset constructors#

Experimental geospatial-augmented wrapper around a pyarrow.dataset.

>>> import geoarrow.pyarrow.dataset as gads

dataset(*args, geometry_columns=None, use_row_groups=None, **kwargs)#

Construct a GeoDataset

This constructor is intended to mirror pyarrow.dataset(), adding geo-specific arguments. See geoarrow.pyarrow.dataset.GeoDataset for details.

>>> import geoarrow.pyarrow.dataset as gads
>>> import geoarrow.pyarrow as ga
>>> import pyarrow as pa
>>> table = pa.table([ga.array(["POINT (0.5 1.5)"])], ["geometry"])
>>> dataset = gads.dataset(table)

class GeoDataset(parent, geometry_columns=None)#

Geospatial-augmented Dataset

EXPERIMENTAL

The GeoDataset wraps a pyarrow.Dataset containing one or more geometry columns and provides indexing and IO capability. If geometry_columns is None, it will include all columns that inherit from geoarrow.pyarrow.GeometryExtensionType. The geometry_columns are not required to be geoarrow extension type columns: text columns will be parsed as WKT; binary columns will be parsed as WKB (but are not detected automatically).

Note that the GeoDataset is only useful in a context where each fragment has been written such that features in the fragment are close together in space (e.g., one file or row group per state).

filter_fragments(target)#

Push down a spatial query into a GeoDataset

Returns a potentially simplified dataset based on the geometry of target. Currently this uses geoarrow.pyarrow.box_agg() on target and performs a simple envelope comparison with each fragment. A future implementation may handle spherical edges using a type of simplified geometry more suitable to a spherical comparison. For datasets with more than one geometry column, the filter will be applied to all columns and include fragments that intersect the simplified geometry from any of the columns.

Note that datasets with large row groups/fragments and/or datasets that were not written with fragments with spatial significance may return most or all of the fragments in the parent dataset.

>>> import geoarrow.pyarrow.dataset as gads
>>> import geoarrow.pyarrow as ga
>>> import pyarrow as pa
>>> table = pa.table([ga.array(["POINT (0.5 1.5)"])], ["geometry"])
>>> dataset = gads.dataset(table)
>>> dataset.filter_fragments("POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))").to_table()
pyarrow.Table
geometry: extension<geoarrow.wkt<WktType>>
----
geometry: []
>>> dataset.filter_fragments("POLYGON ((0 1, 0 2, 1 2, 1 1, 0 1))").to_table()
pyarrow.Table
geometry: extension<geoarrow.wkt<WktType>>
----
geometry: [["POINT (0.5 1.5)"]]

property geometry_columns#

Get a tuple of geometry column names

>>> import geoarrow.pyarrow.dataset as gads
>>> import geoarrow.pyarrow as ga
>>> import pyarrow as pa
>>> table = pa.table([ga.array(["POINT (0.5 1.5)"])], ["geometry"])
>>> dataset = gads.dataset(table)
>>> dataset.geometry_columns
('geometry',)

property geometry_types#

Resolve a tuple of geometry column types

This will convert any primitive types to the corresponding geo-enabled type (e.g., binary to wkb) and check that geometry columns actually refer to a field that can be interpreted as geometry.

>>> import geoarrow.pyarrow.dataset as gads
>>> import geoarrow.pyarrow as ga
>>> import pyarrow as pa
>>> table = pa.table([ga.array(["POINT (0.5 1.5)"])], ["geometry"])
>>> dataset = gads.dataset(table)
>>> dataset.geometry_types
(WktType(geoarrow.wkt),)

get_fragments()#

Resolve the list of fragments in the dataset

This is identical to the list of fragments of its parent.

index_fragments(num_threads=None)#

Resolve a simplified geometry for each fragment

Currently the simplified geometry is a box in the form of a struct array with fields xmin, ymin, xmax, and ymax. The fragment index is currently a table whose first column is the fragment index and whose subsequent columns are named with the geometry column name. A future implementation may handle spherical edges using a type of simplified geometry more suitable to a spherical comparison.

>>> import geoarrow.pyarrow.dataset as gads
>>> import geoarrow.pyarrow as ga
>>> import pyarrow as pa
>>> table = pa.table([ga.array(["POINT (0.5 1.5)"])], ["geometry"])
>>> dataset = gads.dataset(table)
>>> dataset.index_fragments().to_pylist()
[{'_fragment_index': 0, 'geometry': {'xmin': 0.5, 'ymin': 1.5, 'xmax': 0.5, 'ymax': 1.5}}]

property parent#

Get the parent Dataset

Returns the (non geo-aware) parent pyarrow.Dataset.

>>> import geoarrow.pyarrow.dataset as gads
>>> import geoarrow.pyarrow as ga
>>> import pyarrow as pa
>>> table = pa.table([ga.array(["POINT (0.5 1.5)"])], ["geometry"])
>>> dataset = gads.dataset(table)
>>> type(dataset.parent)
<class 'pyarrow._dataset.InMemoryDataset'>

property schema#

Get the dataset schema

The schema of a GeoDataset is identical to that of its parent.

>>> import geoarrow.pyarrow.dataset as gads
>>> import geoarrow.pyarrow as ga
>>> import pyarrow as pa
>>> table = pa.table([ga.array(["POINT (0.5 1.5)"])], ["geometry"])
>>> dataset = gads.dataset(table)
>>> dataset.schema
geometry: extension<geoarrow.wkt<WktType>>

class ParquetRowGroupGeoDataset(parent, row_group_fragments, row_group_ids, geometry_columns=None, use_column_statistics=True)#

Geospatial-augmented Parquet dataset using row groups

An implementation of the GeoDataset that can leverage potentially more efficient indexing and more specific filtering. Notably, this implementation can (1) split a Parquet dataset into a potentially larger number of smaller fragments and (2) use column statistics added by most Parquet writers to more efficiently build the fragment index for types that support this capability.

geoarrow-pyarrow#

Examples#

Array constructors#

Type Constructors#

Compute functions#

Class Reference#

Examples#

Parameters#

Returns#

IO helpers#

Dataset constructors#

This Page