I've encountered behaviour which makes sense, but it is a bit confusing. Consider the following example (mirroring my real data).
This raises a ValueError (traceback below) because dask is not reading the geometry column. In that case, geopandas complains and tells you that you should use pandas.read_parquet instead.
I guess this is again something to be fixed in geopandas, but the issue does not occur when using geopandas alone — and if it does, the error message is meaningful there, which is not the case for dask_geopandas.
The workaround is to read the parquet file using dask.dataframe instead, but dask_geopandas should be able to resolve this under the hood.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-59-35ac7a0f8ffc> in <module>
----> 1 ddf['pop_est'].max().compute()
/opt/conda/lib/python3.7/site-packages/dask/base.py in compute(self, **kwargs)
165 dask.base.compute
166 """
--> 167 (result,) = compute(self, traverse=False, **kwargs)
168 return result
169
/opt/conda/lib/python3.7/site-packages/dask/base.py in compute(*args, **kwargs)
445 postcomputes.append(x.__dask_postcompute__())
446
--> 447 results = schedule(dsk, keys, **kwargs)
448 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
449
/opt/conda/lib/python3.7/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2686 should_rejoin = False
2687 try:
-> 2688 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2689 finally:
2690 for f in futures.values():
/opt/conda/lib/python3.7/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1986 direct=direct,
1987 local_worker=local_worker,
-> 1988 asynchronous=asynchronous,
1989 )
1990
/opt/conda/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
831 else:
832 return sync(
--> 833 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
834 )
835
/opt/conda/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
337 if error[0]:
338 typ, exc, tb = error[0]
--> 339 raise exc.with_traceback(tb)
340 else:
341 return result[0]
/opt/conda/lib/python3.7/site-packages/distributed/utils.py in f()
321 if callback_timeout is not None:
322 future = asyncio.wait_for(future, callback_timeout)
--> 323 result[0] = yield future
324 except Exception as exc:
325 error[0] = sys.exc_info()
/opt/conda/lib/python3.7/site-packages/tornado/gen.py in run(self)
733
734 try:
--> 735 value = future.result()
736 except Exception:
737 exc_info = sys.exc_info()
/opt/conda/lib/python3.7/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1845 exc = CancelledError(key)
1846 else:
-> 1847 raise exception.with_traceback(traceback)
1848 raise exc
1849 if errors == "skip":
/opt/conda/lib/python3.7/site-packages/dask/dataframe/io/parquet/core.py in read_parquet_part()
271 This function is used by `read_parquet`."""
272 if isinstance(part, list):
--> 273 dfs = [func(fs, rg, columns.copy(), index, **kwargs) for rg in part]
274 df = concat(dfs, axis=0)
275 else:
/opt/conda/lib/python3.7/site-packages/dask/dataframe/io/parquet/core.py in <listcomp>()
271 This function is used by `read_parquet`."""
272 if isinstance(part, list):
--> 273 dfs = [func(fs, rg, columns.copy(), index, **kwargs) for rg in part]
274 df = concat(dfs, axis=0)
275 else:
/opt/conda/lib/python3.7/site-packages/dask/dataframe/io/parquet/arrow.py in read_partition()
579
580 arrow_table = cls._parquet_piece_as_arrow(piece, columns, partitions, **kwargs)
--> 581 df = cls._arrow_table_to_pandas(arrow_table, categories, **kwargs)
582
583 # Note that `to_pandas(ignore_metadata=False)` means
/opt/conda/lib/python3.7/site-packages/dask_geopandas/io/parquet.py in _arrow_table_to_pandas()
40
41 # TODO support additional keywords
---> 42 return _arrow_to_geopandas(arrow_table)
43
44 @classmethod
/opt/conda/lib/python3.7/site-packages/geopandas/io/arrow.py in _arrow_to_geopandas()
338 """No geometry columns are included in the columns read from
339 the Parquet/Feather file. To read this file without geometry columns,
--> 340 use pandas.read_parquet/read_feather() instead."""
341 )
342
ValueError: No geometry columns are included in the columns read from
the Parquet/Feather file. To read this file without geometry columns,
use pandas.read_parquet/read_feather() instead.