From d1eb70d280f740b268cf899fb873c8bdc7e28840 Mon Sep 17 00:00:00 2001 From: Bilyal Mestanov Date: Fri, 30 Dec 2022 18:15:32 +0200 Subject: [PATCH 1/4] Implement scraping of non-english reviews --- INPUT_SCHEMA.json | 6 ++++ src/extractors.js | 5 ++++ src/request-factory.js | 21 ++++++++++++-- src/scrapers.js | 62 ++++++++++++++++++++++++++++++++++-------- src/urls.js | 1 + 5 files changed, 81 insertions(+), 14 deletions(-) diff --git a/INPUT_SCHEMA.json b/INPUT_SCHEMA.json index 711644a..4664630 100644 --- a/INPUT_SCHEMA.json +++ b/INPUT_SCHEMA.json @@ -64,6 +64,12 @@ "description": "Extract Reviewer name", "default": false }, + "includeNonEnglishReviews": { + "title": "Include non-English reviews", + "type": "boolean", + "description": "Include non-English reviews", + "default": false + }, "scrapeReviewerUrl": { "title": "Reviewer URL", "type": "boolean", diff --git a/src/extractors.js b/src/extractors.js index 68cea40..c02831c 100644 --- a/src/extractors.js +++ b/src/extractors.js @@ -151,10 +151,15 @@ const yelpBusinessReviews = ({ url, json, scrapeReviewerName, scrapeReviewerUrl return [...reviews.values()]; }; +const yelpReviewLanguages = (json) => { + return get(json, 'reviewLanguages', [{ code: 'en' }]).map((lng) => lng.code); +}; + module.exports = { yelpSearchResultUrls, yelpBusinessPartial, yelpBusinessInfo, yelpBusinessReviews, + yelpReviewLanguages, yelpBizPhotos, }; diff --git a/src/request-factory.js b/src/request-factory.js index 9e879ac..187afed 100644 --- a/src/request-factory.js +++ b/src/request-factory.js @@ -114,9 +114,24 @@ const yelpGraphQl = (url, payload) => { }; }; -const yelpBusinessReview = (bizId, reviewPageStart = undefined, payload = null) => { +const yelpBusinessReviewLanguages = (bizId, payload = null) => { return { - url: `${BASE_URL}/biz/${bizId}/review_feed?rl=en&sort_by=relevance_desc${reviewPageStart ? `&start=${reviewPageStart}` : ''}`, + url: `${BASE_URL}/biz/${bizId}/review_feed`, + headers: { + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + 'X-Requested-With': 'XMLHttpRequest', + 'X-Requested-By-React': true, + }, + userData: { + label: CATEGORIES.REVIEW_LANGUAGES, + payload, + }, + }; +}; + +const yelpBusinessReview = (bizId, lng, reviewPageStart = undefined, payload = null) => { + return { + url: `${BASE_URL}/biz/${bizId}/review_feed?rl=${lng}&sort_by=relevance_desc${reviewPageStart ? `&start=${reviewPageStart}` : ''}`, headers: { 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', 'X-Requested-With': 'XMLHttpRequest', @@ -126,6 +141,7 @@ const yelpBusinessReview = (bizId, reviewPageStart = undefined, payload = null) label: CATEGORIES.REVIEW, payload: { ...payload, + lng, reviewPageStart, }, }, @@ -136,6 +152,7 @@ module.exports = { yelpSearch, yelpSearchTermAndLocation, yelpBusinessInfo, + yelpBusinessReviewLanguages, yelpBusinessReview, yelpGraphQl, yelpBizPhotos, diff --git a/src/scrapers.js b/src/scrapers.js index 09be308..73f5894 100644 --- a/src/scrapers.js +++ b/src/scrapers.js @@ -13,6 +13,7 @@ const { log } = Apify.utils; * maxImages: number, * requestQueue: Apify.RequestQueue, * failedDataset: Apify.Dataset, + * includeNonEnglishReviews: boolean, * }} params * @param {(data: any) => Promise} pushResults * @param {(data: any) => Promise} pushFailedSearch @@ -26,6 +27,7 @@ const createYelpPageHandler = ({ failedDataset, scrapeReviewerName, scrapeReviewerUrl, + includeNonEnglishReviews, }) => ( async ({ request, body, $ = null, json = null }) => { if (request.userData.label === CATEGORIES.SEARCH) { @@ -106,11 +108,33 @@ const createYelpPageHandler = ({ const { payload } = request.userData; const enrichedBusinessInfo = extract.yelpBusinessInfo(json); - const followup = requests.yelpBusinessReview(payload.business.bizId, null, { - ...request.userData.payload, - business: nonDestructiveMerge([ request.userData.payload.business, enrichedBusinessInfo ]), - }); + let followup; + if (includeNonEnglishReviews) { + followup = requests.yelpBusinessReviewLanguages(payload.business.bizId, { + ...request.userData.payload, + business: nonDestructiveMerge([ request.userData.payload.business, enrichedBusinessInfo ]), + }); + } else { + followup = requests.yelpBusinessReview(payload.business.bizId, 'en', null, { + ...payload, + languages: ['en'], + }); + } + await requestQueue.addRequest(followup); + } else if (request.userData.label === CATEGORIES.REVIEW_LANGUAGES) { + const { payload } = request.userData; + const languages = extract.yelpReviewLanguages(json); + log.info(`[REVIEW_LANGUAGES]: Reviews found in the following languages: ${JSON.stringify(languages)}`); + + await Promise.all(languages.map((lng) => { + const followup = requests.yelpBusinessReview(payload.business.bizId, lng, null, { + ...payload, + languages, + }); + + return requestQueue.addRequest(followup); + })); } else if (request.userData.label === CATEGORIES.REVIEW) { const payload = (request && request.userData && request.userData.payload) || {}; const newReviews = extract.yelpBusinessReviews({ @@ -135,18 +159,32 @@ const createYelpPageHandler = ({ if (allReviews.length < totalReviewCount && allReviews.length < reviewLimit && newReviews.length > 0) { // log.info('\tContinuing with next page of reviews...'); - await requestQueue.addRequest(requests.yelpBusinessReview(payload.business.bizId, reviewPageStart + newReviews.length, - { - ...payload, - scrapedReviews: allReviews, - })); + await requestQueue.addRequest(requests.yelpBusinessReview(payload.business.bizId, payload.lng, reviewPageStart + newReviews.length,{ + ...payload, + scrapedReviews: allReviews, + })); } else { // log.info('\tNo more reviews to scrape, saving what we got'); - await Apify.pushData({ + const data = await Apify.getValue('OUTPUT'); + const output = { + ...data, ...request.userData.payload.business, + reviewLanguages: [payload.lng].concat(data?.reviewLanguages || []), scrapeFinishedAt: new Date().toISOString(), - reviews: allReviews, - }); + reviews: allReviews.concat(data?.allReviews || []), + }; + + const allLanguagesCount = request.userData.payload.languages.length; + const currentLanguagesCount = output.reviewLanguages.length; + if (currentLanguagesCount < allLanguagesCount) { + await Apify.setValue('OUTPUT', output); + } else { + output.reviewLanguages = undefined; + await Promise.all([ + Apify.pushData(output), + Apify.setValue('OUTPUT', null), + ]); + } } } else { request.noRetry = true; diff --git a/src/urls.js b/src/urls.js index 2e1b21a..107d8f2 100644 --- a/src/urls.js +++ b/src/urls.js @@ -20,6 +20,7 @@ const CATEGORIES = { PHOTOS: 'photos', GRAPHQL: 'graphql', REVIEW: 'review', + REVIEW_LANGUAGES: 'review_languages', UNKNOWN: 'unknown', }; From 44a4665d1725d85b2a8fbfc21a5f476acf2a89c6 Mon Sep 17 00:00:00 2001 From: Bilyal Mestanov Date: Fri, 30 Dec 2022 18:20:27 +0200 Subject: [PATCH 2/4] Update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8e6e5b4..d82754b 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ When using the scraper on the Apify platform or locally, there are multiple conf | reviewLimit | number | No | 20 | Minimum number of reviews to scrape. | | proxy | proxy configuration | Yes | `{ useApifyProxy: true }` | Proxy groups and other proxy related configuration. | | maxRequestRetries | number | No | 10 | How many times a failed request is retried before thrown away. Requests usually failed when blocked by the target site. +| includeNonEnglishReviews | boolean | No | `false` | By default the scraper will only get reviews in English. You can get reviews in all languages by using this field. One of `searchTerm` or `directUrls` is required. If none are specified, nothing will be scrapped. From fdf87dfdd1e0a3e2f8084aa1594bf770974c8f29 Mon Sep 17 00:00:00 2001 From: Bilyal Mestanov Date: Fri, 30 Dec 2022 18:23:47 +0200 Subject: [PATCH 3/4] Pass input to page handler --- src/main.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main.js b/src/main.js index 565ec7d..31eacb9 100644 --- a/src/main.js +++ b/src/main.js @@ -22,6 +22,7 @@ Apify.main(async () => { proxy, scrapeReviewerName = false, scrapeReviewerUrl = false, + includeNonEnglishReviews = false, } = input; const proxyConfiguration = await proxyConfigurationValidated({ proxyConfig: proxy }); @@ -59,6 +60,7 @@ Apify.main(async () => { failedDataset, scrapeReviewerName, scrapeReviewerUrl, + includeNonEnglishReviews, }); const crawler = new Apify.CheerioCrawler({ requestQueue, From beca1cebdafd718a2a94e23bdd25febc5ae66128 Mon Sep 17 00:00:00 2001 From: Bilyal Mestanov Date: Fri, 30 Dec 2022 18:58:19 +0200 Subject: [PATCH 4/4] Fix temp kv store access --- src/scrapers.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scrapers.js b/src/scrapers.js index 73f5894..ffad0f8 100644 --- a/src/scrapers.js +++ b/src/scrapers.js @@ -171,7 +171,7 @@ const createYelpPageHandler = ({ ...request.userData.payload.business, reviewLanguages: [payload.lng].concat(data?.reviewLanguages || []), scrapeFinishedAt: new Date().toISOString(), - reviews: allReviews.concat(data?.allReviews || []), + reviews: allReviews.concat(data?.reviews || []), }; const allLanguagesCount = request.userData.payload.languages.length;