Commit cdb92c8 (1 parent: a1f924a)

docs: add apache parquet example (#476)

2 files changed: +54 −0 lines

examples/README.md (+5)

```diff
@@ -10,6 +10,11 @@
 - [write_api_callbacks.py](write_api_callbacks.py) - How to handle batch events
 - [write_structured_data.py](write_structured_data.py) - How to write structured data - [NamedTuple](https://docs.python.org/3/library/collections.html#collections.namedtuple), [Data Classes](https://docs.python.org/3/library/dataclasses.html) - (_requires Python v3.8+_)
 - [logging_handler.py](logging_handler.py) - How to set up a python native logging handler that writes to InfluxDB
+- [import_parquet.py](import_parquet.py) - How to import [Apache Parquet](https://parquet.apache.org/) data files;
+  the example requires:
+   - manually downloading the [NYC TLC Trip Record Data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
+   - installing the Apache Arrow dependency: `pip install pyarrow`
+

 ## Queries
 - [query.py](query.py) - How to query data into `FluxTable`s, `Stream` and `CSV`
```

examples/import_parquet.py (new file, +49)

```python
import pyarrow.parquet as pq

from influxdb_client import InfluxDBClient, WriteOptions

with InfluxDBClient(url="http://localhost:8086", token="my-token", org="my-org", timeout=0, debug=False) as client:
    # You can download the NYC TLC Trip Record Data parquet file from
    # https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
    table = pq.read_table('fhvhv_tripdata_2022-01.parquet')
    with client.write_api(write_options=WriteOptions(batch_size=50_000)) as write_api:
        dataframe = table.to_pandas()
        # Keep only the interesting columns
        keep_df = dataframe[
            ['dispatching_base_num', "PULocationID", "DOLocationID",
             "pickup_datetime", "dropoff_datetime", "shared_request_flag"]]
        print(keep_df.tail().to_string())

        write_api.write(bucket="my-bucket", record=keep_df,
                        data_frame_measurement_name="taxi-trip-data",
                        data_frame_tag_columns=['dispatching_base_num', "shared_request_flag"],
                        data_frame_timestamp_column="pickup_datetime")

    # Query 10 pickups from dispatching base 'B03404'
    query = '''
        from(bucket:"my-bucket")
            |> range(start: 2022-01-01T00:00:00Z, stop: now())
            |> filter(fn: (r) => r._measurement == "taxi-trip-data")
            |> filter(fn: (r) => r.dispatching_base_num == "B03404")
            |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")
            |> rename(columns: {_time: "pickup_datetime"})
            |> drop(columns: ["_start", "_stop"])
            |> limit(n:10, offset: 0)
    '''

    result = client.query_api().query(query=query)

    # Process the results
    print()
    print("=== Querying 10 pickups from dispatching 'B03404' ===")
    print()
    for table in result:
        for record in table.records:
            print(f'Dispatching: {record["dispatching_base_num"]} '
                  f'pickup: {record["pickup_datetime"]} dropoff: {record["dropoff_datetime"]}')
```
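One detail worth calling out from the write step above: `data_frame_timestamp_column` points the client at `pickup_datetime`, and that column should carry a real pandas datetime dtype (the client typically treats naive timestamps as UTC). If the source data arrives as strings, convert it before writing. A minimal sketch using pandas only, with made-up rows standing in for the Parquet data:

```python
import pandas as pd

# Hypothetical rows standing in for the trip-record data.
df = pd.DataFrame({
    "dispatching_base_num": ["B03404", "B03404"],
    "pickup_datetime": ["2022-01-01 00:05:00", "2022-01-01 00:12:00"],
})

# Parse the timestamp column and make it timezone-aware before handing
# the frame to write_api.write(..., data_frame_timestamp_column="pickup_datetime").
df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"]).dt.tz_localize("UTC")
print(df["pickup_datetime"].dtype)  # datetime64[ns, UTC]
```

Making the timezone explicit up front avoids ambiguity about which instant each row is stored under in InfluxDB.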
