@@ -25,8 +25,9 @@ class PartitionInfo:
25
25
METADATA_DIR_COLUMN_NAME = "Dir"
26
26
METADATA_PIXEL_COLUMN_NAME = "Npix"
27
27
28
- def __init__ (self , pixel_list : List [HealpixPixel ]) -> None :
28
+ def __init__ (self , pixel_list : List [HealpixPixel ], catalog_base_dir : str = None ) -> None :
29
29
self .pixel_list = pixel_list
30
+ self .catalog_base_dir = catalog_base_dir
30
31
31
32
def get_healpix_pixels (self ) -> List [HealpixPixel ]:
32
33
"""Get healpix pixel objects for all pixels represented as partitions.
@@ -45,25 +46,55 @@ def get_highest_order(self) -> int:
45
46
max_pixel = np .max (self .pixel_list )
46
47
return max_pixel .order
47
48
48
def write_to_file(
    self,
    partition_info_file: FilePointer = None,
    catalog_path: FilePointer = None,
    storage_options: dict = None,
):
    """Write all partition data to CSV file.

    The destination is resolved in this order: an explicit `partition_info_file`,
    then the partition info pointer under `catalog_path`, then the partition
    info pointer under the catalog base directory stored on this object.

    Args:
        partition_info_file: FilePointer to where the `partition_info.csv`
            file will be written.
        catalog_path: base directory for a catalog where the `partition_info.csv`
            file will be written.
        storage_options (dict): dictionary that contains abstract filesystem credentials

    Raises:
        ValueError: if no path is provided, and could not be inferred.
    """
    if partition_info_file is None:
        # Prefer the directory passed by the caller; only consult the stored
        # base directory when none was given.
        base_dir = catalog_path if catalog_path is not None else self.catalog_base_dir
        if base_dir is None:
            raise ValueError("partition_info_file is required if info was not loaded from a directory")
        partition_info_file = paths.get_partition_info_pointer(base_dir)

    file_io.write_dataframe_to_csv(
        self.as_dataframe(), partition_info_file, index=False, storage_options=storage_options
    )
59
80
60
- def write_to_metadata_files (self , catalog_path : FilePointer , storage_options : dict = None ):
81
+ def write_to_metadata_files (self , catalog_path : FilePointer = None , storage_options : dict = None ):
61
82
"""Generate parquet metadata, using the known partitions.
62
83
84
+ If no catalog_path is provided, the catalog base directory from the `read_from_dir` call is used.
85
+
63
86
Args:
64
87
catalog_path (FilePointer): base path for the catalog
65
88
storage_options (dict): dictionary that contains abstract filesystem credentials
89
+
90
+ Raises:
91
+ ValueError: if no path is provided, and could not be inferred.
66
92
"""
93
+ if catalog_path is None :
94
+ if self .catalog_base_dir is None :
95
+ raise ValueError ("catalog_path is required if info was not loaded from a directory" )
96
+ catalog_path = self .catalog_base_dir
97
+
67
98
batches = [
68
99
[
69
100
pa .RecordBatch .from_arrays (
@@ -102,22 +133,39 @@ def read_from_dir(cls, catalog_base_dir: FilePointer, storage_options: dict = No
102
133
metadata_file = paths .get_parquet_metadata_pointer (catalog_base_dir )
103
134
partition_info_file = paths .get_partition_info_pointer (catalog_base_dir )
104
135
if file_io .does_file_or_directory_exist (partition_info_file , storage_options = storage_options ):
105
- partition_info = PartitionInfo .read_from_csv (partition_info_file , storage_options = storage_options )
136
+ pixel_list = PartitionInfo ._read_from_csv (partition_info_file , storage_options = storage_options )
106
137
elif file_io .does_file_or_directory_exist (metadata_file , storage_options = storage_options ):
107
138
warnings .warn ("Reading partitions from parquet metadata. This is typically slow." )
108
- partition_info = PartitionInfo .read_from_file (metadata_file , storage_options = storage_options )
139
+ pixel_list = PartitionInfo ._read_from_metadata_file (
140
+ metadata_file , storage_options = storage_options
141
+ )
109
142
else :
110
143
raise FileNotFoundError (
111
144
f"_metadata or partition info file is required in catalog directory { catalog_base_dir } "
112
145
)
113
- return partition_info
146
+ return cls ( pixel_list , catalog_base_dir )
114
147
115
148
@classmethod
def read_from_file(
    cls, metadata_file: FilePointer, strict=False, storage_options: dict = None
) -> PartitionInfo:
    """Build a `PartitionInfo` from the contents of a `_metadata` file.

    Args:
        metadata_file (FilePointer): FilePointer to the `_metadata` file
        strict: passed through unchanged to `_read_from_metadata_file`
        storage_options (dict): dictionary that contains abstract filesystem credentials

    Returns:
        A `PartitionInfo` object with the data from the file
    """
    # No catalog base directory is attached here; only the pixel list is passed on.
    pixel_list = cls._read_from_metadata_file(metadata_file, strict, storage_options)
    return cls(pixel_list)
162
+
163
+ @classmethod
164
+ def _read_from_metadata_file (
165
+ cls , metadata_file : FilePointer , strict = False , storage_options : dict = None
166
+ ) -> List [HealpixPixel ]:
167
+ """Read partition info list from a `_metadata` file.
168
+
121
169
Args:
122
170
metadata_file (FilePointer): FilePointer to the `_metadata` file
123
171
storage_options (dict): dictionary that contains abstract filesystem credentials
@@ -163,14 +211,25 @@ def read_from_file(
163
211
## Remove duplicates, preserving order.
164
212
## In the case of association partition join info, we may have multiple entries
165
213
## for the primary order/pixels.
166
- pixel_list = list (dict .fromkeys (pixel_list ))
167
-
168
- return cls (pixel_list )
214
+ return list (dict .fromkeys (pixel_list ))
169
215
170
216
@classmethod
def read_from_csv(cls, partition_info_file: FilePointer, storage_options: dict = None) -> PartitionInfo:
    """Build a `PartitionInfo` from the contents of a `partition_info.csv` file.

    Args:
        partition_info_file (FilePointer): FilePointer to the `partition_info.csv` file
        storage_options (dict): dictionary that contains abstract filesystem credentials

    Returns:
        A `PartitionInfo` object with the data from the file
    """
    # No catalog base directory is attached here; only the pixel list is passed on.
    pixel_list = cls._read_from_csv(partition_info_file, storage_options)
    return cls(pixel_list)
228
+
229
+ @classmethod
230
+ def _read_from_csv (cls , partition_info_file : FilePointer , storage_options : dict = None ) -> PartitionInfo :
231
+ """Read the list of partition pixels from a `partition_info.csv` file.
232
+
174
233
Args:
175
234
partition_info_file (FilePointer): FilePointer to the `partition_info.csv` file
176
235
storage_options (dict): dictionary that contains abstract filesystem credentials
@@ -183,16 +242,14 @@ def read_from_csv(cls, partition_info_file: FilePointer, storage_options: dict =
183
242
184
243
data_frame = file_io .load_csv_to_pandas (partition_info_file , storage_options = storage_options )
185
244
186
- pixel_list = [
245
+ return [
187
246
HealpixPixel (order , pixel )
188
247
for order , pixel in zip (
189
248
data_frame [cls .METADATA_ORDER_COLUMN_NAME ],
190
249
data_frame [cls .METADATA_PIXEL_COLUMN_NAME ],
191
250
)
192
251
]
193
252
194
- return cls (pixel_list )
195
-
196
253
def as_dataframe (self ):
197
254
"""Construct a pandas dataframe for the partition info pixels.
198
255
0 commit comments