Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to scrape non-english reviews #28

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions INPUT_SCHEMA.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@
"description": "Extract Reviewer name",
"default": false
},
"includeNonEnglishReviews": {
"title": "Include non-English reviews",
"type": "boolean",
"description": "Include non-English reviews",
"default": false
},
"scrapeReviewerUrl": {
"title": "Reviewer URL",
"type": "boolean",
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ When using the scraper on the Apify platform or locally, there are multiple conf
| reviewLimit | number | No | 20 | Minimum number of reviews to scrape. |
| proxy | proxy configuration | Yes | `{ useApifyProxy: true }` | Proxy groups and other proxy related configuration. |
| maxRequestRetries | number | No | 10 | How many times a failed request is retried before it is thrown away. Requests usually fail when they are blocked by the target site. |
| includeNonEnglishReviews | boolean | No | `false` | By default the scraper only collects reviews written in English. Set this field to `true` to collect reviews in all available languages. |

One of `searchTerm` or `directUrls` is required. If neither is specified, nothing will be scraped.

Expand Down
5 changes: 5 additions & 0 deletions src/extractors.js
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,15 @@ const yelpBusinessReviews = ({ url, json, scrapeReviewerName, scrapeReviewerUrl
return [...reviews.values()];
};

/**
 * Lists the language codes in which reviews are available.
 *
 * Reads the `reviewLanguages` array of the given response object and returns
 * the `code` of every entry, in original order and without duplicates.
 * Falls back to `['en']` when the field is missing, is not an array, or
 * contains no usable code, so the result is never empty.
 *
 * @param {any} json Parsed response object that may carry `reviewLanguages`.
 * @returns {string[]} Non-empty list of language codes.
 */
const yelpReviewLanguages = (json) => {
    const fallbackCodes = ['en'];
    const languages = get(json, 'reviewLanguages');

    // A `null` or otherwise malformed field must not crash on `.map`.
    if (!Array.isArray(languages)) {
        return fallbackCodes;
    }

    const codes = languages
        .map((lng) => lng && lng.code)
        .filter((code) => typeof code === 'string' && code.length > 0);

    // Each code is meant to be requested once, so repeats are dropped.
    const uniqueCodes = [...new Set(codes)];

    // An empty list would leave nothing to request at all; default to English.
    return uniqueCodes.length > 0 ? uniqueCodes : fallbackCodes;
};

// Extractor functions this module makes available to other modules via `require`.
module.exports = {
yelpSearchResultUrls,
yelpBusinessPartial,
yelpBusinessInfo,
yelpBusinessReviews,
yelpReviewLanguages,
yelpBizPhotos,
};
2 changes: 2 additions & 0 deletions src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Apify.main(async () => {
proxy,
scrapeReviewerName = false,
scrapeReviewerUrl = false,
includeNonEnglishReviews = false,
} = input;

const proxyConfiguration = await proxyConfigurationValidated({ proxyConfig: proxy });
Expand Down Expand Up @@ -59,6 +60,7 @@ Apify.main(async () => {
failedDataset,
scrapeReviewerName,
scrapeReviewerUrl,
includeNonEnglishReviews,
});
const crawler = new Apify.CheerioCrawler({
requestQueue,
Expand Down
21 changes: 19 additions & 2 deletions src/request-factory.js
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,24 @@ const yelpGraphQl = (url, payload) => {
};
};

const yelpBusinessReview = (bizId, reviewPageStart = undefined, payload = null) => {
/**
 * Builds the request descriptor used to discover in which languages a
 * business has reviews.
 *
 * The URL points at the business review feed without any `rl` language
 * filter, and the request is labelled `CATEGORIES.REVIEW_LANGUAGES`.
 *
 * @param {string} bizId Business identifier interpolated into the URL path (assumed to be a string).
 * @param {object|null} [payload=null] Data carried along unchanged in `userData.payload`.
 * @returns {{ url: string, headers: object, userData: { label: string, payload: any } }}
 */
const yelpBusinessReviewLanguages = (bizId, payload = null) => {
    return {
        // Only one `url` key: the feed URL has no paging or language parameters here.
        url: `${BASE_URL}/biz/${bizId}/review_feed`,
        headers: {
            'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
            'X-Requested-With': 'XMLHttpRequest',
            'X-Requested-By-React': true,
        },
        userData: {
            label: CATEGORIES.REVIEW_LANGUAGES,
            payload,
        },
    };
};

const yelpBusinessReview = (bizId, lng, reviewPageStart = undefined, payload = null) => {
return {
url: `${BASE_URL}/biz/${bizId}/review_feed?rl=${lng}&sort_by=relevance_desc${reviewPageStart ? `&start=${reviewPageStart}` : ''}`,
headers: {
'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
'X-Requested-With': 'XMLHttpRequest',
Expand All @@ -126,6 +141,7 @@ const yelpBusinessReview = (bizId, reviewPageStart = undefined, payload = null)
label: CATEGORIES.REVIEW,
payload: {
...payload,
lng,
reviewPageStart,
},
},
Expand All @@ -136,6 +152,7 @@ module.exports = {
yelpSearch,
yelpSearchTermAndLocation,
yelpBusinessInfo,
yelpBusinessReviewLanguages,
yelpBusinessReview,
yelpGraphQl,
yelpBizPhotos,
Expand Down
62 changes: 50 additions & 12 deletions src/scrapers.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ const { log } = Apify.utils;
* maxImages: number,
* requestQueue: Apify.RequestQueue,
* failedDataset: Apify.Dataset,
* includeNonEnglishReviews: boolean,
* }} params
* @param {(data: any) => Promise<void>} pushResults
* @param {(data: any) => Promise<void>} pushFailedSearch
Expand All @@ -26,6 +27,7 @@ const createYelpPageHandler = ({
failedDataset,
scrapeReviewerName,
scrapeReviewerUrl,
includeNonEnglishReviews,
}) => (
async ({ request, body, $ = null, json = null }) => {
if (request.userData.label === CATEGORIES.SEARCH) {
Expand Down Expand Up @@ -106,11 +108,33 @@ const createYelpPageHandler = ({
const { payload } = request.userData;
const enrichedBusinessInfo = extract.yelpBusinessInfo(json);

const followup = requests.yelpBusinessReview(payload.business.bizId, null, {
...request.userData.payload,
business: nonDestructiveMerge([ request.userData.payload.business, enrichedBusinessInfo ]),
});
let followup;
if (includeNonEnglishReviews) {
followup = requests.yelpBusinessReviewLanguages(payload.business.bizId, {
...request.userData.payload,
business: nonDestructiveMerge([ request.userData.payload.business, enrichedBusinessInfo ]),
});
} else {
followup = requests.yelpBusinessReview(payload.business.bizId, 'en', null, {
...payload,
languages: ['en'],
});
}

await requestQueue.addRequest(followup);
} else if (request.userData.label === CATEGORIES.REVIEW_LANGUAGES) {
const { payload } = request.userData;
const languages = extract.yelpReviewLanguages(json);
log.info(`[REVIEW_LANGUAGES]: Reviews found in the following languages: ${JSON.stringify(languages)}`);

await Promise.all(languages.map((lng) => {
const followup = requests.yelpBusinessReview(payload.business.bizId, lng, null, {
...payload,
languages,
});

return requestQueue.addRequest(followup);
}));
} else if (request.userData.label === CATEGORIES.REVIEW) {
const payload = (request && request.userData && request.userData.payload) || {};
const newReviews = extract.yelpBusinessReviews({
Expand All @@ -135,18 +159,32 @@ const createYelpPageHandler = ({

if (allReviews.length < totalReviewCount && allReviews.length < reviewLimit && newReviews.length > 0) {
// log.info('\tContinuing with next page of reviews...');
await requestQueue.addRequest(requests.yelpBusinessReview(payload.business.bizId, reviewPageStart + newReviews.length,
{
...payload,
scrapedReviews: allReviews,
}));
await requestQueue.addRequest(requests.yelpBusinessReview(payload.business.bizId, payload.lng, reviewPageStart + newReviews.length,{
...payload,
scrapedReviews: allReviews,
}));
} else {
// log.info('\tNo more reviews to scrape, saving what we got');
await Apify.pushData({
const data = await Apify.getValue('OUTPUT');
const output = {
...data,
...request.userData.payload.business,
reviewLanguages: [payload.lng].concat(data?.reviewLanguages || []),
scrapeFinishedAt: new Date().toISOString(),
reviews: allReviews,
});
reviews: allReviews.concat(data?.reviews || []),
};

const allLanguagesCount = request.userData.payload.languages.length;
const currentLanguagesCount = output.reviewLanguages.length;
if (currentLanguagesCount < allLanguagesCount) {
await Apify.setValue('OUTPUT', output);
} else {
output.reviewLanguages = undefined;
await Promise.all([
Apify.pushData(output),
Apify.setValue('OUTPUT', null),
]);
}
}
} else {
request.noRetry = true;
Expand Down
1 change: 1 addition & 0 deletions src/urls.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ const CATEGORIES = {
PHOTOS: 'photos',
GRAPHQL: 'graphql',
REVIEW: 'review',
REVIEW_LANGUAGES: 'review_languages',
UNKNOWN: 'unknown',
};

Expand Down