diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..c6c6788 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +global-exclude *test.py \ No newline at end of file diff --git a/README.md b/README.md index 127f375..ed241a1 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Feel free to do any analysis you wish. For example: .plot() ) ``` -![Coal vs Wind in the US since 1940](demo.png) +![Coal vs Wind in the US since 1940](https://raw.githubusercontent.com/alxmrs/dask-ee/main/demo.png) There are a few other useful things you can do. @@ -81,6 +81,11 @@ df.head() Contributions are welcome. A good way to start is to check out open [issues](https://github.com/alxmrs/dask-ee/issues) or file a new one. We're happy to review pull requests, too. +Before writing code, please install the development dependencies (after cloning the repo): +```shell +pip install -e ".[dev]" +``` + ## License ``` Copyright 2024 Alexander S Merose diff --git a/dask_ee/read.py b/dask_ee/read.py index d9f7ebf..6077353 100644 --- a/dask_ee/read.py +++ b/dask_ee/read.py @@ -26,8 +26,6 @@ } -# TODO(#4): Support 'auto' chunks, where we calculate the maximum allowed page size given the number of -# bytes in each row. def read_ee( fc: t.Union[ee.FeatureCollection, str], chunksize: t.Union[int, t.Literal['auto']] = 5_000, @@ -41,25 +39,27 @@ def read_ee( Returns: A dask DataFrame with paged Google Earth Engine data. """ + # TODO(#4): Support 'auto' chunks, where we calculate the maximum allowed page size given the number of + # bytes in each row. + if chunksize == 'auto': + raise NotImplementedError('Auto chunksize is not implemented yet!') if isinstance(fc, str): fc = ee.FeatureCollection(fc) - if chunksize == 'auto': - raise NotImplementedError('Auto chunksize is not implemented yet!') - # Make all the getInfo() calls at once, up front. fc_size, all_info = ee.List([fc.size(), fc.limit(0)]).getInfo() columns = {'geo': 'Json'} columns.update(all_info['columns']) - del columns['system:index'] + if 'system:index' in columns: + del columns['system:index'] divisions = tuple(range(0, fc_size, chunksize)) # TODO(#5): Compare `toList()` to other range operations, like getting all index IDs via `getInfo()`. pages = [ee.FeatureCollection(fc.toList(chunksize, i)) for i in divisions] - # Get the remainder, if it exists. `io_chunks` are not likely to evenly partition the data. + # Get the remainder, if it exists. `chunksize` is not likely to evenly partition the data. d, r = divmod(fc_size, chunksize) if r != 0: pages.append(ee.FeatureCollection(fc.toList(r, d))) diff --git a/dask_ee/read_integrationtest.py b/dask_ee/read_integrationtest.py index ca794f6..d580f73 100644 --- a/dask_ee/read_integrationtest.py +++ b/dask_ee/read_integrationtest.py @@ -23,28 +23,66 @@ def setUpClass(cls): ee.Initialize() def test_reads_dask_dataframe(self): - fc = ee.FeatureCollection("WRI/GPPD/power_plants") - ddf = dask_ee.read_ee(fc) + fc = ee.FeatureCollection('WRI/GPPD/power_plants') + df = dask_ee.read_ee(fc) - head = ddf.head() - columns = ddf.columns + head = df.head() + columns = df.columns - self.assertIsNotNone(ddf) + self.assertIsNotNone(df) self.assertIsNotNone(head) - self.assertIsInstance(ddf, dd.DataFrame) - self.assertEqual(ddf.compute().shape, (28_664, 23)) + self.assertIsInstance(df, dd.DataFrame) + self.assertEqual(df.compute().shape, (28_664, 23)) print(columns) print(head) + def test_works_with_defined_features(self): + # Make a list of Features. + features = [ + ee.Feature( + ee.Geometry.Rectangle(30.01, 59.80, 30.59, 60.15), + {'name': 'Voronoi'}, + ), + ee.Feature(ee.Geometry.Point(-73.96, 40.781), {'name': 'Thiessen'}), + ee.Feature(ee.Geometry.Point(6.4806, 50.8012), {'name': 'Dirichlet'}), + ] + + fc = ee.FeatureCollection(features) + + df = dask_ee.read_ee(fc) + + self.assertEqual(list(df.columns), ['geo', 'name']) + + def test_works_with_a_single_feature_in_fc(self): + from_geom = ee.FeatureCollection(ee.Geometry.Point(16.37, 48.225)) + + df = dask_ee.read_ee(from_geom) + + self.assertEqual(list(df.columns), ['geo']) + self.assertEqual(df.compute().shape, (1, 1)) + + def test_can_create_random_points(self): + # Define an arbitrary region in which to compute random points. + region = ee.Geometry.Rectangle(-119.224, 34.669, -99.536, 50.064) + + # Create 1000 random points in the region. + random_points = ee.FeatureCollection.randomPoints(region) + + # Note: these random points have no system:index! + df = dask_ee.read_ee(random_points) + + self.assertEqual(list(df.columns), ['geo']) + self.assertEqual(df.compute().shape, (1000, 1)) + def test_prof__read_ee(self): - fc = ee.FeatureCollection("WRI/GPPD/power_plants") + fc = ee.FeatureCollection('WRI/GPPD/power_plants') with cProfile.Profile() as pr: _ = dask_ee.read_ee(fc) # Modified version of `pr.print_stats()`. - pstats.Stats(pr).sort_stats("cumtime").print_stats() + pstats.Stats(pr).sort_stats('cumtime').print_stats() -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/dask_ee/read_test.py b/dask_ee/read_test.py index 1e4b17f..28b6e1b 100644 --- a/dask_ee/read_test.py +++ b/dask_ee/read_test.py @@ -9,6 +9,12 @@ def test_can_import_read_op(self): except ModuleNotFoundError: self.fail('Cannot import `read_ee` function.') + def test_rejects_auto_chunks(self): + import dask_ee + + with self.assertRaises(NotImplementedError): + dask_ee.read_ee('WRI/GPPD/power_plants', 'auto') + if __name__ == '__main__': unittest.main() diff --git a/pyproject.toml b/pyproject.toml index a0b19f2..2b239b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,10 @@ tests = [ "pytest", "pyink", ] +dev = [ + "dask-ee[tests]", + "build", +] [project.urls] Homepage = "https://github.com/alxmrs/dask-ee"