|
8 | 8 |
|
9 | 9 | ARTICLE_TOKEN = "Article: "
|
10 | 10 | HEADER_TOKEN = "Subject: "
|
11 |
| -URLS = [ |
12 |
| - "https://www.iguazio.com/blog/iguazio-releases-data-science-platform-version-2-8/", |
13 |
| - "https://www.iguazio.com/blog/intelligent-edge-iguazio-google/", |
14 |
| - "https://www.iguazio.com/blog/top-9-odsc-europe-sessions-you-cant-miss/", |
15 |
| - "https://www.iguazio.com/blog/cloud-native-will-shake-up-enterprise-storage/", |
16 |
| - "https://www.iguazio.com/blog/building-an-automated-ml-pipeline-with-a-feature-store-using-iguazio-snowflake/", |
17 |
| - "https://www.iguazio.com/blog/concept-drift-and-the-impact-of-covid-19-on-data-science/", |
18 |
| - "https://www.iguazio.com/blog/odsc-east-boston-2022-top-11-sessions-for-ai-and-ml-professionals-to-attend/", |
19 |
| - "https://www.iguazio.com/blog/idc-mlopmarketscape-2022/", |
20 |
| - "https://www.iguazio.com/blog/iguazio-listed-in-7-gartner-hype-cycles-for-2021/", |
21 |
| - "https://www.iguazio.com/blog/announcing-the-winners-mlops-for-good-hackathon/", |
22 |
| - "https://www.iguazio.com/blog/the-importance-of-data-storytelling-in-shaping-a-data-science-product/", |
23 |
| - "https://www.iguazio.com/blog/modernize-it-infrastructure/", |
24 |
| - "https://www.iguazio.com/blog/implementing-automation-and-an-mlops-framework-for-enterprise-scale-ml/", |
25 |
| - "https://www.iguazio.com/blog/automating-ml-pipelines-on-azure-and-azure-stack/", |
26 |
| - "https://www.iguazio.com/blog/real-time-streaming-for-data-science/", |
27 |
| - "https://www.iguazio.com/blog/dcos-apps/", |
28 |
| - "https://www.iguazio.com/blog/iguazio-receives-an-honorable-mention-in" |
29 |
| - "-the-2021-gartner-magic-quadrant-for-data-science-and-machine-learning-platforms/", |
30 |
| - "https://www.iguazio.com/blog/gartner-2022-market-guide-for-dsml-engineering-platforms/", |
31 |
| - "https://www.iguazio.com/blog/can-open-source-serverless-be-simpler-than-lambda/", |
32 |
| - "https://www.iguazio.com/blog/cncf-webinar-serverless-ai/", |
33 |
| - "https://www.iguazio.com/blog/2018-can-cloud-big-data-ai-stand-turmoil/", |
34 |
| - "https://www.iguazio.com/blog/2022-predictions/", |
35 |
| - "https://www.iguazio.com/blog/mlops-for-python/", |
36 |
| - "https://www.iguazio.com/blog/mlops-predictions-for-2023/", |
37 |
| - "https://www.iguazio.com/blog/adopting-a-production-first-approach-to-enterprise-ai/", |
38 |
| - "https://www.iguazio.com/blog/from-automl-to-automlops/", |
39 |
| - "https://www.iguazio.com/blog/odscwest2021/", |
40 |
| - "https://www.iguazio.com/blog/top-10-recommended-mlops-world-2021-sessions/", |
41 |
| - "https://www.iguazio.com/blog/" |
42 |
| - "breaking-the-silos-between-data-scientists-engineers-and-devops-with-new-mlops-practices/", |
43 |
| - "https://www.iguazio.com/blog/top-8-machine-learning-resources-for-data-scientists-data-engineers-and-everyone/", |
44 |
| - "https://www.iguazio.com/blog/azure-synapse-analytics-and-iguazio/", |
45 |
| - "https://www.iguazio.com/blog/" |
46 |
| - "how-to-tap-into-higher-level-abstraction-efficiency-automation-to-simplify-your-ai-ml-journey/", |
47 |
| - "https://www.iguazio.com/blog/how-seagate-runs-advanced-manufacturing-at-scale-with-iguazio/", |
48 |
| - "https://www.iguazio.com/blog/predictive-real-time-operational-ml-pipeline-fighting-customer-churn/", |
49 |
| - "https://www.iguazio.com/blog/build-an-ai-app-in-under-20-minutes/", |
50 |
| - "https://www.iguazio.com/blog/deploying-machine-learning-models-for-real-time-predictions-checklist/", |
51 |
| - "https://www.iguazio.com/blog/data-science-post-hadoop/", |
52 |
| - "https://www.iguazio.com/blog/wanted-a-faster-storage-stack/", |
53 |
| - "https://www.iguazio.com/blog/kubernetes-the-open-scalable-approach-to-ml-pipelines/", |
54 |
| - "https://www.iguazio.com/blog/vmware-on-aws-a-scorecard-for-winners-and-losers/", |
55 |
| - "https://www.iguazio.com/blog/aws-reinvent-data-serverless-ai/", |
56 |
| - "https://www.iguazio.com/blog/beyond-hyped-iguazio-named-in-8-gartner-hype-cycles-for-2022/", |
57 |
| - "https://www.iguazio.com/blog/ai-ml-and-roi-why-your-balance-sheet-cares-about-your-technology-choices/", |
58 |
| - "https://www.iguazio.com/blog/orchestrating-ml-pipelines-scale-kubeflow/", |
59 |
| - "https://www.iguazio.com/blog/using-automated-model-management-for-cpg-trade-success/", |
60 |
| - "https://www.iguazio.com/blog/spark-over-kubernetes/", |
61 |
| - "https://www.iguazio.com/blog/announcing-iguazio-version-3-0-breaking-the-silos-for-faster-deployment/", |
62 |
| - "https://www.iguazio.com/blog/the-complete-guide-to-using-the-iguazio-feature-store-with-azure-ml-part-4/", |
63 |
| - "https://www.iguazio.com/blog/accelerating-ml-deployment-in-hybrid-environments/", |
64 |
| - "https://www.iguazio.com/blog/it-worked-fine-in-jupyter-now-what/", |
65 |
| - "https://www.iguazio.com/blog/kubeflow-vs-mlflow-vs-mlrun/", |
66 |
| - "https://www.iguazio.com/blog/part-one-the-complete-guide-to-using-the-iguazio-feature-store-with-azure-ml/", |
67 |
| - "https://www.iguazio.com/blog/handling-large-datasets-with-mlops-dask-on-kubernetes/", |
68 |
| - "https://www.iguazio.com/blog/faster-ai-development-serverless/", |
69 |
| - "https://www.iguazio.com/blog/nuclio-future-serverless-computing/", |
70 |
| - "https://www.iguazio.com/blog/how-to-build-real-time-feature-engineering-with-a-feature-store/", |
71 |
| - "https://www.iguazio.com/blog/nyc-meetup-jan2018/", |
72 |
| - "https://www.iguazio.com/blog/distributed-feature-store-ingestion-with-iguazio-snowflake-and-spark/", |
73 |
| - "https://www.iguazio.com/blog/iguazio-raises-33m-accelerate-digital-transformation/", |
74 |
| - "https://www.iguazio.com/blog/the-complete-guide-to-using-the-iguazio-feature-store-with-azure-ml-part-2/", |
75 |
| - "https://www.iguazio.com/blog/serverless-can-it-simplify-data-science-projects/", |
76 |
| - "https://www.iguazio.com/blog/machine-learning-hard/", |
77 |
| - "https://www.iguazio.com/blog/free-manufacturing-datasets/", |
78 |
| - "https://www.iguazio.com/blog/building-real-time-ml-pipelines-with-a-feature-store/", |
79 |
| - "https://www.iguazio.com/blog/paving-the-data-science-dirt-road/", |
80 |
| - "https://www.iguazio.com/blog/horovod-for-deep-learning-on-a-gpu-cluster/", |
81 |
| - "https://www.iguazio.com/blog/using-containers-as-mini-vms-is-not-cloud-native/", |
82 |
| - "https://www.iguazio.com/blog/top-9-recommended-odsc-europe-2021-sessions/", |
83 |
| - "https://www.iguazio.com/blog/realtime-bigdata/", |
84 |
| - "https://www.iguazio.com/blog/python-pandas-performance/", |
85 |
| - "https://www.iguazio.com/blog/iguazio-rvmworld-2017-vmware-feeds-off-openstack-decay/", |
86 |
| - "https://www.iguazio.com/blog/how-gpuaas-on-kubeflow-can-boost-your-productivity/", |
87 |
| - "https://www.iguazio.com/blog/mlops-nyc-sessions/", |
88 |
| - "https://www.iguazio.com/blog/2017-predictions-clouds-thunder-and-fog/", |
89 |
| - "https://www.iguazio.com/blog/odsc-east-2023/", |
90 |
| - "https://www.iguazio.com/blog/join-us-at-nvidia-gtc-2021/", |
91 |
| - "https://www.iguazio.com/blog/mckinsey-acquires-iguazio-our-startups-journey/", |
92 |
| - "https://www.iguazio.com/blog/git-based-ci-cd-for-machine-learning-mlops/", |
93 |
| - "https://www.iguazio.com/blog/mlops-for-good-hackathon-roundup/", |
94 |
| - "https://www.iguazio.com/blog/big-data-must-begin-with-clean-slate/", |
95 |
| - "https://www.iguazio.com/blog/suse-iguazio/", |
96 |
| - "https://www.iguazio.com/blog/how-to-run-workloads-on-spark-operator-with-dynamic-allocation-using-mlrun/", |
97 |
| - "https://www.iguazio.com/blog/will-kubernetes-sink-the-hadoop-ship/", |
98 |
| - "https://www.iguazio.com/blog/5-incredible-data-science-solutions-for-real-world-problems/", |
99 |
| - "https://www.iguazio.com/blog/mlops-challenges-solutions-future-trends/", |
100 |
| - "https://www.iguazio.com/blog/cloud-data-services-sprawl-its-complicated/", |
101 |
| - "https://www.iguazio.com/blog/predicting-1st-day-churn-in-real-time/", |
102 |
| - "https://www.iguazio.com/blog/machine-learning-experiment-tracking-from-zero-to-hero-in-2-lines-of-code/", |
103 |
| - "https://www.iguazio.com/blog/how-to-bring-breakthrough-performance-and-productivity-to-ai-ml-projects/", |
104 |
| - "https://www.iguazio.com/blog/how-to-deploy-an-mlrun-project-in-a-ci-cd-process-with-jenkins-pipeline/", |
105 |
| - "https://www.iguazio.com/blog/iguazio-named-in-forresters-now-tech-ai-ml-platforms-q1-2022/", |
106 |
| - "https://www.iguazio.com/blog/the-complete-guide-to-using-the-iguazio-feature-store-with-azure-ml-part-3/", |
107 |
| - "https://www.iguazio.com/blog/what-are-feature-stores-and-why-are-they-critical-for-scaling-data-science/", |
108 |
| - "https://www.iguazio.com/blog/reinventing-data-services/", |
109 |
| - "https://www.iguazio.com/blog/re-structure-in-big-data/", |
110 |
| - "https://www.iguazio.com/blog/top-22-free-healthcare-datasets-for-machine-learning/", |
111 |
| - "https://www.iguazio.com/blog/operationalizing-machine-learning-for-the-automotive-future/", |
112 |
| - "https://www.iguazio.com/blog/automating-mlops-for-deep-learning-how-to-operationalize-dl-with-minimal-effort/", |
113 |
| - "https://www.iguazio.com/blog/iguazio-named-a-fast-moving-leader-by-gigaom-in-the-radar-for-mlops-report/", |
114 |
| - "https://www.iguazio.com/blog/" |
115 |
| - "data-science-salon-review-elevating-data-science-practices-for-media-entertainment-advertising/", |
116 |
| - "https://www.iguazio.com/blog/wrapping-up-serverless-nyc-2018/", |
117 |
| - "https://www.iguazio.com/blog/the-next-gen-digital-transformation-cloud-native-data-platforms/", |
118 |
| - "https://www.iguazio.com/blog/best-practices-for-succeeding-with-mlops/", |
119 |
| - "https://www.iguazio.com/blog/did-amazon-just-kill-open-source/", |
120 |
| - "https://www.iguazio.com/blog/cloud-native-storage-primer/", |
121 |
| - "https://www.iguazio.com/blog/serverless-background-challenges-and-future/", |
122 |
| - "https://www.iguazio.com/blog/experiment-tracking/", |
123 |
| - "https://www.iguazio.com/blog/continuous-analytics-real-time-meets-cloud-native/", |
124 |
| - "https://www.iguazio.com/blog/concept-drift-deep-dive-how-to-build-a-drift-aware-ml-system/", |
125 |
| - "https://www.iguazio.com/blog/building-ml-pipelines-over-federated-data-compute-environments/", |
126 |
| - "https://www.iguazio.com/blog/top-8-recommended-mlops-world-2022-sessions/", |
127 |
| - "https://www.iguazio.com/blog/it-vendors-dont-stand-a-chance-against-the-cloud/", |
128 |
| - "https://www.iguazio.com/blog/ml-workflows-what-can-you-automate/", |
129 |
| - "https://www.iguazio.com/blog/iguazio-collaborates-with-equinix-to-offer-data-centric-hybrid-cloud-solutions/", |
130 |
| - "https://www.iguazio.com/blog/gigaom-names-iguazio-a-leader-and-outperformer-for-2022/", |
131 |
| - "https://www.iguazio.com/blog/iguazio-nvidia-edge/", |
132 |
| - "https://www.iguazio.com/blog/extending-kubeflow-into-an-end-to-end-ml-solution/", |
133 |
| - "https://www.iguazio.com/blog/iguazio-listed-in-five-2020-gartner-hype-cycle-reports/", |
134 |
| - "https://www.iguazio.com/blog/data-science-trends-2020/", |
135 |
| - "https://www.iguazio.com/blog/operationalizing-data-science/", |
136 |
| - "https://www.iguazio.com/blog/using-snowflake-and-dask-for-large-scale-ml-workloads/", |
137 |
| - "https://www.iguazio.com/blog/best-13-free-financial-datasets-for-machine-learning/", |
138 |
| - "https://www.iguazio.com/blog/introduction-to-tf-serving/", |
139 |
| - "https://www.iguazio.com/blog/hcis-journey-to-mlops-efficiency/", |
140 |
| - "https://www.iguazio.com/blog/streamlined-iot-at-scale-with-iguazio/", |
141 |
| - "https://www.iguazio.com/blog/iguazio-product-update-optimize-your-ml-workload-costs-with-aws-ec2-spot-instances/", |
142 |
| - "https://www.iguazio.com/blog/top-10-odsc-west-sessions-you-must-attend/", |
143 |
| - "https://www.iguazio.com/blog/iguazio-named-a-leader-and-outperformer-in-gigaom-radar-for-mlops-2022/", |
144 |
| - "https://www.iguazio.com/blog/deploying-your-hugging-face-models-to-production-at-scale-with-mlrun/", |
145 |
| -] |
146 | 11 |
|
147 | 12 |
|
148 | 13 | def normalize(s: str) -> str:
|
@@ -193,20 +58,22 @@ def get_html_as_string(url: str, mark_headers: bool) -> str:
|
193 | 58 |
|
194 | 59 |
|
195 | 60 | @mlrun.handler(outputs=["html-as-text-files:directory"])
|
196 |
| -def collect_html_to_text_files(urls, mark_headers=True) -> str: |
| 61 | +def collect_html_to_text_files(urls_file: str, mark_headers=True) -> str: |
197 | 62 | """
|
198 | 63 | Retrieve all html text content from URLs as text files.
|
199 | 64 |
|
200 |
| - :param urls: html URLs |
| 65 | + :param urls_file: path to a text file listing the html URLs, one URL per line |
201 | 66 | :param mark_headers: Whether to add article and header prefixes to the headers in the text
|
202 | 67 |
|
203 | 68 | :returns: the directory name that contains all the content text files.
|
204 | 69 | """
|
205 | 70 | directory = "html_as_text_files"
|
206 | 71 | os.makedirs(directory, exist_ok=True)
|
207 | 72 | # Writing html files as text files:
|
208 |
| - urls = URLS |
| 73 | + with open(urls_file, "r") as f: |
| 74 | + urls = f.readlines() |
209 | 75 | for url in urls:
|
| 76 | + url = url.replace("\n", "") |
210 | 77 | page_name = Path(url).name
|
211 | 78 | with open(f"{directory}/{page_name}.txt", "w") as f:
|
212 | 79 | f.write(get_html_as_string(url, mark_headers))
|
|
0 commit comments