diff --git a/configs/m4_shards.yaml b/configs/m4_shards.yaml new file mode 100644 index 00000000..6fafefcd --- /dev/null +++ b/configs/m4_shards.yaml @@ -0,0 +1,198 @@ +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/0/{00000..00346}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/1/{00000..00384}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/10/{00000..00347}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/100/{00000..00365}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/101/{00000..00366}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/102/{00000..00345}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/103/{00000..00380}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/104/{00000..00355}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/105/{00000..00362}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/106/{00000..00368}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/107/{00000..00347}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/108/{00000..00377}.tar -' +- 'pipe:aws s3 cp 
s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/109/{00000..00357}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/11/{00000..00375}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/110/{00000..00357}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/111/{00000..00371}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/112/{00000..00348}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/113/{00000..00374}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/114/{00000..00359}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/115/{00000..00353}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/116/{00000..00374}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/117/{00000..00349}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/118/{00000..00370}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/119/{00000..00361}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/12/{00000..00358}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/120/{00000..00348}.tar -' +- 
'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/121/{00000..00377}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/122/{00000..00352}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/123/{00000..00366}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/124/{00000..00365}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/125/{00000..00344}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/126/{00000..00381}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/127/{00000..00354}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/128/{00000..00363}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/129/{00000..00367}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/13/{00000..00355}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/130/{00000..00346}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/131/{00000..00377}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/132/{00000..00356}.tar -' +- 'pipe:aws s3 cp 
s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/133/{00000..00359}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/134/{00000..00370}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/135/{00000..00347}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/136/{00000..00374}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/137/{00000..00358}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/138/{00000..00354}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/139/{00000..00373}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/14/{00000..00372}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/140/{00000..00349}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/141/{00000..00371}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/142/{00000..00360}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/143/{00000..00349}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/144/{00000..00376}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/145/{00000..00351}.tar -' +- 
'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/146/{00000..00368}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/147/{00000..00363}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/148/{00000..00345}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/149/{00000..00380}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/15/{00000..00348}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/150/{00000..00353}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/151/{00000..00363}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/152/{00000..00366}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/153/{00000..00344}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/154/{00000..00379}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/155/{00000..00355}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/156/{00000..00360}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/157/{00000..00368}.tar -' +- 'pipe:aws s3 cp 
s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/158/{00000..00347}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/159/{00000..00376}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/16/{00000..00371}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/160/{00000..00357}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/161/{00000..00355}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/162/{00000..00371}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/163/{00000..00348}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/164/{00000..00371}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/165/{00000..00360}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/166/{00000..00350}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/167/{00000..00376}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/168/{00000..00350}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/169/{00000..00368}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/17/{00000..00360}.tar -' +- 
'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/170/{00000..00362}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/171/{00000..00345}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/172/{00000..00379}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/173/{00000..00352}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/174/{00000..00364}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/175/{00000..00365}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/176/{00000..00345}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/177/{00000..00379}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/178/{00000..00355}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/179/{00000..00360}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/18/{00000..00351}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/180/{00000..00368}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/181/{00000..00346}.tar -' +- 'pipe:aws s3 cp 
s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/182/{00000..00376}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/183/{00000..00357}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/185/{00000..00371}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/186/{00000..00348}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/187/{00000..00372}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/188/{00000..00359}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/19/{00000..00376}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/190/{00000..00375}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/191/{00000..00350}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/192/{00000..00368}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/193/{00000..00363}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/194/{00000..00345}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/195/{00000..00379}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/196/{00000..00352}.tar -' +- 
'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/197/{00000..00365}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/198/{00000..00365}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/199/{00000..00344}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/2/{00000..00354}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/20/{00000..00351}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/21/{00000..00367}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/22/{00000..00362}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/23/{00000..00343}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/24/{00000..00375}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/25/{00000..00347}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/26/{00000..00359}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/27/{00000..00361}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/28/{00000..00340}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/29/{00000..00374}.tar 
-' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/3/{00000..00364}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/30/{00000..00349}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/31/{00000..00355}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/32/{00000..00363}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/33/{00000..00341}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/34/{00000..00372}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/35/{00000..00353}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/36/{00000..00355}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/37/{00000..00371}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/38/{00000..00348}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/39/{00000..00373}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/4/{00000..00367}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/40/{00000..00360}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/41/{00000..00352}.tar 
-' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/42/{00000..00375}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/43/{00000..00350}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/44/{00000..00369}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/45/{00000..00362}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/46/{00000..00347}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/47/{00000..00378}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/48/{00000..00353}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/49/{00000..00366}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/5/{00000..00345}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/50/{00000..00365}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/51/{00000..00344}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/52/{00000..00380}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/53/{00000..00354}.tar -' +- 'pipe:aws s3 cp 
s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/54/{00000..00361}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/55/{00000..00367}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/56/{00000..00345}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/57/{00000..00377}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/58/{00000..00355}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/59/{00000..00358}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/6/{00000..00378}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/60/{00000..00370}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/61/{00000..00347}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/62/{00000..00373}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/63/{00000..00358}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/64/{00000..00353}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/65/{00000..00372}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/66/{00000..00348}.tar -' +- 'pipe:aws s3 
cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/67/{00000..00370}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/68/{00000..00361}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/69/{00000..00348}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/7/{00000..00356}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/70/{00000..00377}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/71/{00000..00351}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/72/{00000..00367}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/73/{00000..00363}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/74/{00000..00343}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/75/{00000..00381}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/76/{00000..00353}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/77/{00000..00363}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/78/{00000..00365}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/79/{00000..00345}.tar -' +- 'pipe:aws 
s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/8/{00000..00359}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/80/{00000..00378}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/81/{00000..00355}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/82/{00000..00359}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/83/{00000..00369}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/84/{00000..00346}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/85/{00000..00375}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/86/{00000..00357}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/87/{00000..00356}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/88/{00000..00372}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/89/{00000..00349}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/9/{00000..00369}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/90/{00000..00373}.tar -' +- 'pipe:aws s3 cp s3://muse-datasets/m4-datasets-laion-dataset-filtered-dedup-joined-with-stability-metadata-laicov2/91/{00000..00360}.tar -' +- 'pipe:aws 
def get_aesthetic_score(json):
    """Return the aesthetic score found in a sample's metadata dict as a float.

    Different datasets store the score under different keys, so the keys are
    probed in a fixed priority order and the first one holding a non-null
    value wins:

    1. ``"aesthetic"``
    2. ``"AESTHETIC_SCORE"``
    3. ``"aesthetic_score_laion_v2"``
    4. ``"stability_metadata" -> "aes_scorelv2"`` (nested dict)

    Args:
        json: Decoded metadata dict of one sample. NOTE: the parameter name
            shadows the ``json`` module inside this function; it is kept
            unchanged so existing callers are not affected.

    Returns:
        The score converted with ``float()``, or ``0.0`` when none of the
        known keys carries a usable value.

    Raises:
        ValueError: If the selected value is a string that ``float()`` cannot
            parse.
    """
    # A key that is present but null (JSON ``null`` -> ``None``) is treated
    # the same as a missing key: ``float(None)`` would raise ``TypeError``, so
    # we skip it and fall through to the next candidate key instead.
    for key in ("aesthetic", "AESTHETIC_SCORE", "aesthetic_score_laion_v2"):
        score = json.get(key)
        if score is not None:
            return float(score)

    # ``stability_metadata`` may itself be null or not a mapping; only look
    # inside it when it really is a dict.
    stability_metadata = json.get("stability_metadata")
    if isinstance(stability_metadata, dict):
        score = stability_metadata.get("aes_scorelv2")
        if score is not None:
            return float(score)

    return 0.0
class WebdatasetSelect:
    """Predicate deciding whether a webdataset sample is kept.

    The sample's ``"json"`` entry is decoded and checked against a series of
    requirements. Different datasets store the same information under
    different keys, so each requirement looks at several candidate keys.

    For every requirement, if none of the candidate keys carries a usable value
    the requirement is assumed NOT to hold and the sample is rejected. A value
    that is ``null`` or not numeric is treated like an absent key, so
    incomplete metadata rejects the sample instead of raising.

    Args:
        min_size: Minimum allowed ``original_width`` and ``original_height``.
        max_pwatermark: Maximum allowed watermark probability. Every watermark
            score that is present must be at or below it.
        min_aesthetic_score: Minimum allowed aesthetic score. Every aesthetic
            score that is present must be at or above it.
        require_marked_as_ok_by_spawning: When true, the sample must carry
            ``stability_metadata["is_spawning"]`` and that flag must be falsy.
        require_marked_as_not_getty: When true, the sample must carry
            ``stability_metadata["is_getty"]`` and that flag must be falsy.
        max_pnsfw: When not ``None``, the sample must carry
            ``stability_metadata["p_nsfwdf"]`` at or below this value.
    """

    def __init__(
        self,
        min_size=256,
        max_pwatermark=0.5,
        min_aesthetic_score=4.9,
        require_marked_as_ok_by_spawning=False,
        require_marked_as_not_getty=False,
        max_pnsfw=None,
    ):
        self.min_size = min_size
        self.max_pwatermark = max_pwatermark
        self.min_aesthetic_score = min_aesthetic_score
        self.require_marked_as_ok_by_spawning = require_marked_as_ok_by_spawning
        self.require_marked_as_not_getty = require_marked_as_not_getty
        self.max_pnsfw = max_pnsfw

    @staticmethod
    def _to_float(value):
        """Return ``value`` as a float, or ``None`` if it is null or not numeric."""
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return None

    @classmethod
    def _scores(cls, x_json, stability_metadata, top_level_keys, stability_key):
        """Collect every usable score stored under the given candidate keys."""
        candidates = [x_json.get(key) for key in top_level_keys]
        candidates.append(stability_metadata.get(stability_key))
        converted = (cls._to_float(candidate) for candidate in candidates)
        return [score for score in converted if score is not None]

    def __call__(self, x):
        """Return ``True`` if sample ``x`` satisfies every configured requirement."""
        if "json" not in x:
            return False

        try:
            x_json = json.loads(x["json"])
        except (TypeError, ValueError, RecursionError):
            # Undecodable metadata: there is nothing to check the sample against.
            return False

        if not isinstance(x_json, dict):
            # Valid JSON but not an object (e.g. a list or a number).
            return False

        stability_metadata = x_json.get("stability_metadata")
        if not isinstance(stability_metadata, dict):
            # Missing, null or malformed nested metadata behaves like an empty dict.
            stability_metadata = {}

        # size
        width = self._to_float(x_json.get("original_width"))
        height = self._to_float(x_json.get("original_height"))
        if width is None or height is None:
            return False
        if width < self.min_size or height < self.min_size:
            return False

        # watermark: at least one score must exist and none may exceed the maximum
        watermark_scores = self._scores(
            x_json, stability_metadata, ("pwatermark", "watermark_score"), "p_watermarkdf"
        )
        if not watermark_scores:
            return False
        if any(score > self.max_pwatermark for score in watermark_scores):
            return False

        # aesthetic: at least one score must exist and none may fall below the minimum
        aesthetic_scores = self._scores(
            x_json,
            stability_metadata,
            ("aesthetic", "AESTHETIC_SCORE", "aesthetic_score_laion_v2"),
            "aes_scorelv2",
        )
        if not aesthetic_scores:
            return False
        if any(score < self.min_aesthetic_score for score in aesthetic_scores):
            return False

        # spawning: the flag must be present, and a truthy flag rejects the sample
        if self.require_marked_as_ok_by_spawning:
            if "is_spawning" not in stability_metadata:
                return False
            if stability_metadata["is_spawning"]:
                return False

        # getty: the flag must be present, and a truthy flag rejects the sample
        if self.require_marked_as_not_getty:
            if "is_getty" not in stability_metadata:
                return False
            if stability_metadata["is_getty"]:
                return False

        # nsfw
        if self.max_pnsfw is not None:
            p_nsfw = self._to_float(stability_metadata.get("p_nsfwdf"))
            if p_nsfw is None or p_nsfw > self.max_pnsfw:
                return False

        return True
require_marked_as_ok_by_spawning=require_marked_as_ok_by_spawning, + require_marked_as_not_getty=require_marked_as_not_getty, + max_pnsfw=max_pnsfw, + ) + ) + else: + select = None + # Create train dataset and loader pipeline = [ wds.ResampledShards(train_shards_path_or_url), tarfile_to_samples_nothrow, - wds.select( - WebdatasetFilter(min_size=256, max_pwatermark=0.5, aesthetic_threshold=4.9) - if use_filtered_dataset - else lambda x: True - ), + *([select] if select is not None else []), wds.shuffle(shuffle_buffer_size), *processing_pipeline, wds.batched(per_gpu_batch_size, partial=False, collation_fn=default_collate), diff --git a/training/train_muse.py b/training/train_muse.py index 132ef720..bf8e6ede 100644 --- a/training/train_muse.py +++ b/training/train_muse.py @@ -457,6 +457,9 @@ def save_model_hook(models, weights, output_dir): vae_checkpoint=config.model.vq_model.pretrained, text_encoder_checkpoint=config.model.text_encoder.pretrained, use_filtered_dataset=dataset_config.get("use_filtered_dataset", False), + require_marked_as_ok_by_spawning=dataset_config.get("require_marked_as_ok_by_spawning", False), + require_marked_as_not_getty=dataset_config.get("require_marked_as_not_getty", False), + max_pnsfw=dataset_config.get("max_pnsfw", None), ) train_dataloader, eval_dataloader = dataset.train_dataloader, dataset.eval_dataloader