Commit 3544fff: first commit (0 parents)

15 files changed: 8,910 additions, 0 deletions

.actor/Dockerfile

Lines changed: 57 additions & 0 deletions
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Build the project.
RUN npm run build

# Create final image
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm run start:prod --silent
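
To try the image locally, the repository root is the build context, so a build along these lines should work (the image tag is an arbitrary placeholder):

```
docker build -f .actor/Dockerfile -t dynamic-web-scraper .
```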

.actor/actor.json

Lines changed: 9 additions & 0 deletions
{
    "actorSpecification": 1,
    "name": "store-dynamic-web-scraper",
    "title": "Dynamic Web Scraper",
    "description": "Dynamically crawls websites based on their content",
    "version": "0.0",
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

Lines changed: 32 additions & 0 deletions
{
    "title": "Dynamic Web Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startSources": {
            "title": "Start Sources",
            "type": "array",
            "description": "Sources to start with; these can be URLs or Google queries.",
            "editor": "stringList",
            "prefill": ["Lucerna Bar Cafe, Prague"],
            "minItems": 1
        },
        "prompt": {
            "title": "Prompt",
            "type": "string",
            "editor": "textarea",
            "description": "Prompt to evaluate against the page content.",
            "prefill": "Look for the food menu items and if you find it look for a chicken option"
        },
        "maxDepth": {
            "title": "Max Depth",
            "type": "integer",
            "description": "Maximum depth to scrape.",
            "default": 3
        }
    },
    "required": [
        "startSources",
        "prompt"
    ]
}
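
For reference, an input that matches this schema (using the prefill values above) could look like:

```
{
    "startSources": ["Lucerna Bar Cafe, Prague"],
    "prompt": "Look for the food menu items and if you find it look for a chicken option",
    "maxDepth": 3
}
```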

.dockerignore

Lines changed: 14 additions & 0 deletions
# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

Lines changed: 9 additions & 0 deletions
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

Lines changed: 20 additions & 0 deletions
{
    "root": true,
    "env": {
        "browser": true,
        "es2020": true,
        "node": true
    },
    "extends": [
        "@apify/eslint-config-ts"
    ],
    "parserOptions": {
        "project": "./tsconfig.json",
        "ecmaVersion": 2020
    },
    "ignorePatterns": [
        "node_modules",
        "dist",
        "**/*.d.ts"
    ]
}

.gitignore

Lines changed: 12 additions & 0 deletions
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
.vscode
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv

README.md

Lines changed: 35 additions & 0 deletions
## What does Dynamic Web Scraper do?
Dynamic Web Scraper is an Apify Actor that gathers information online by simulating user browsing behavior on the web. It reduces both the scraping time and the number of pages scraped by using a model (ChatGPT) to make decisions about browser navigation and result evaluation.

Dynamic Web Scraper takes as input a prompt and a list of initial URLs or Google queries. The model then decides which URLs should be visited in order to provide the best answer to the prompt.

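Conceptually, the navigation loop looks something like the sketch below. This is only an illustration, not the Actor's actual source; `fetchPage` and `askModel` are hypothetical stand-ins for the crawler and the ChatGPT call.

```
// Illustration only: fetchPage and askModel are hypothetical stand-ins
// for the real crawler and the ChatGPT call.
type ModelDecision = { answer?: string; followUrls: string[] };

declare function fetchPage(url: string): Promise<{ text: string; links: string[] }>;
declare function askModel(prompt: string, page: { text: string; links: string[] }): Promise<ModelDecision>;

async function crawl(prompt: string, startUrls: string[], maxDepth: number) {
    const queue = startUrls.map((url) => ({ url, depth: 0 }));
    const visited = new Set<string>();

    while (queue.length > 0) {
        const { url, depth } = queue.shift()!;
        if (visited.has(url) || depth > maxDepth) continue;
        visited.add(url);

        const page = await fetchPage(url);
        // The model either answers the prompt from this page, or picks
        // which of the page's links are worth visiting next.
        const decision = await askModel(prompt, page);
        if (decision.answer) return { url, depth, response: decision.answer };
        queue.push(...decision.followUrls.map((next) => ({ url: next, depth: depth + 1 })));
    }
    return null; // no page answered the prompt within maxDepth
}
```
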
## Why scrape websites dynamically?
Dynamic web scraping allows for more efficient data extraction by automating the browsing process. It can help gather specific information from websites that may not be easily accessible through traditional scraping methods.

## How to use Dynamic Web Scraper
To use Dynamic Web Scraper, follow these steps:
1. Click on Try for free.
2. Enter the prompt and a list of initial URLs or Google queries.
3. Click on Run.
4. Once the actor has finished, preview or download your data from the Dataset tab.
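
If you prefer to run the Actor programmatically, here is a sketch using the `apify-client` npm package (the Actor ID and token are placeholders; the input values are the prefills from the input schema):

```
import { ApifyClient } from 'apify-client';

// Placeholders: supply your own API token and the Actor's real ID.
const client = new ApifyClient({ token: '<YOUR_APIFY_TOKEN>' });

const run = await client.actor('<ACTOR_ID>').call({
    startSources: ['Lucerna Bar Cafe, Prague'],
    prompt: 'Look for the food menu items and if you find it look for a chicken option',
    maxDepth: 3,
});

// Fetch the results from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.dir(items);
```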

## How much will it cost to use Dynamic Web Scraper?
Apify provides $5 of free usage credits every month on the Apify Free plan. For more extensive data extraction needs, consider upgrading to a paid Apify subscription.

## Results
An example of the JSON results produced by the actor:
```
{
    "url": "https://www.example.com",
    "inputSource": "Example input",
    "depth": 3,
    "response": <Response based on input>
}
```
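
In TypeScript terms, each dataset record might be typed roughly as below. This is a sketch inferred from the example above; in particular, `response` is assumed to be a string here, but it is whatever the model returns for your prompt.

```
// Sketch of a dataset record's shape, inferred from the example above.
interface ScraperResult {
    url: string;         // page the answer was found on
    inputSource: string; // the start source (URL or Google query) it came from
    depth: number;       // number of navigation steps from the start source
    response: string;    // assumed string: the model's answer to the prompt
}
```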

## Tips for using Dynamic Web Scraper
- Ensure that your prompt is clear and specific to get accurate results.
- Monitor the actor's progress to ensure it is navigating the web effectively.

## Is it legal to use Dynamic Web Scraper?
It is important to be aware of legal considerations when scraping websites, especially regarding data privacy regulations such as GDPR. Ensure that you have a legitimate reason for scraping and consult legal advice if needed.
For more information on the legality of web scraping, read our blog post: [Is Web Scraping Legal?](https://blog.apify.com/is-web-scraping-legal/)
