[Discuss] Wrapper longer response times caused by some overhead/additional processing #50

dimitryzub opened this issue May 23, 2023 · 0 comments

@jvmvik this issue is for discussion.

I'm not 100% sure what the cause is, but there might be some overhead or additional processing in the wrapper that causes longer response times. Or is this expected behavior? Let me know if that's the case.

The table below shows results from making 50 requests:

| Direct requests to serpapi.com/search.json | Requests through the API wrapper | Async batch requests with Queue |
|---|---|---|
| ~7.19 seconds | ~135.30 seconds | ~24.80 seconds |
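
To see where the extra time goes inside a single wrapper call, one option is to profile it with cProfile from the standard library. This is only a minimal sketch; the api_key placeholder and the 'burly' query are illustrative values matching the snippets below:

import cProfile
import pstats
from serpapi import YoutubeSearch

params = {
    'api_key': '...',
    'engine': 'youtube',
    'device': 'desktop',
    'search_query': 'burly',
    'no_cache': 'true'
}

# profile one wrapper call and print the 15 most expensive calls by cumulative time
with cProfile.Profile() as profiler:
    YoutubeSearch(params).get_json()

pstats.Stats(profiler).sort_stats(pstats.SortKey.CUMULATIVE).print_stats(15)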

Making a direct request to serpapi.com/search.json:

import aiohttp
import asyncio
import os
import json
import time

async def fetch_results(session, query):
    params = {
        'api_key': '...',
        'engine': 'youtube',
        'device': 'desktop',
        'search_query': query,
        'no_cache': 'true'
    }
    
    url = 'https://serpapi.com/search.json'
    async with session.get(url, params=params) as response:
        results = await response.json()

    data = []

    if 'error' in results:
        print(results['error'])
    else:
        for result in results.get('video_results', []):
            data.append({
                'title': result.get('title'),
                'link': result.get('link'),
                'channel': result.get('channel', {}).get('name'),
            })

    return data

async def main():
    # 50 queries
    queries = [
        'burly',
        'creator',
        'doubtful',
        'chance',
        'capable',
        'window',
        'dynamic',
        'train',
        'worry',
        'useless',
        'steady',
        'thoughtful',
        'matter',
        'rotten',
        'overflow',
        'object',
        'far-flung',
        'gabby',
        'tiresome',
        'scatter',
        'exclusive',
        'wealth',
        'yummy',
        'play',
        'saw',
        'spiteful',
        'perform',
        'busy',
        'hypnotic',
        'sniff',
        'early',
        'mindless',
        'airplane',
        'distribution',
        'ahead',
        'good',
        'squeeze',
        'ship',
        'excuse',
        'chubby',
        'smiling',
        'wide',
        'structure',
        'wrap',
        'point',
        'file',
        'sack',
        'slope',
        'therapeutic',
        'disturbed'
    ]

    data = []

    async with aiohttp.ClientSession() as session:
        tasks = []
        for query in queries:
            task = asyncio.ensure_future(fetch_results(session, query))
            tasks.append(task)

        start_time = time.time()
        results = await asyncio.gather(*tasks)
        end_time = time.time()

        data = [item for sublist in results for item in sublist]

    print(json.dumps(data, indent=2, ensure_ascii=False))
    print(f'Script execution time: {end_time - start_time} seconds') # ~7.192448616027832 seconds

asyncio.run(main())

The same code, but using the wrapper's YoutubeSearch class (I'm not 100% sure this is a valid comparison; a thread-offloading variant is sketched after the snippet):

import aiohttp
import asyncio
from serpapi import YoutubeSearch
import os
import json
import time

async def fetch_results(session, query):
    params = {
        'api_key': '...',
        'engine': 'youtube',
        'device': 'desktop',
        'search_query': query,
        'no_cache': 'true'
    }
    search = YoutubeSearch(params)
    results = search.get_json()  # synchronous request; the aiohttp session above is never used

    data = []

    if 'error' in results:
        print(results['error'])
    else:
        for result in results.get('video_results', []):
            data.append({
                'title': result.get('title'),
                'link': result.get('link'),
                'channel': result.get('channel', {}).get('name'),
            })

    return data

async def main():
    queries = [
        'burly',
        'creator',
        'doubtful',
        'chance',
        'capable',
        'window',
        'dynamic',
        'train',
        'worry',
        'useless',
        'steady',
        'thoughtful',
        'matter',
        'rotten',
        'overflow',
        'object',
        'far-flung',
        'gabby',
        'tiresome',
        'scatter',
        'exclusive',
        'wealth',
        'yummy',
        'play',
        'saw',
        'spiteful',
        'perform',
        'busy',
        'hypnotic',
        'sniff',
        'early',
        'mindless',
        'airplane',
        'distribution',
        'ahead',
        'good',
        'squeeze',
        'ship',
        'excuse',
        'chubby',
        'smiling',
        'wide',
        'structure',
        'wrap',
        'point',
        'file',
        'sack',
        'slope',
        'therapeutic',
        'disturbed'
    ]

    data = []

    async with aiohttp.ClientSession() as session:
        tasks = []
        for query in queries:
            task = asyncio.ensure_future(fetch_results(session, query))
            tasks.append(task)
        
        start_time = time.time()
        results = await asyncio.gather(*tasks)
        end_time = time.time()

        data = [item for sublist in results for item in sublist]

    print(json.dumps(data, indent=2, ensure_ascii=False))
    print(f'Script execution time: {end_time - start_time} seconds') # ~135.2969319820404 seconds

asyncio.run(main())
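
For reference, one way to make this comparison closer to the direct aiohttp version might be to push each blocking get_json() call into a worker thread with asyncio.to_thread (Python 3.9+). This is only a sketch under that assumption, not a measured result; fetch_results_blocking is a helper name I'm introducing here, and the query list is shortened for brevity:

import asyncio
import json
import time
from serpapi import YoutubeSearch

def fetch_results_blocking(query):
    params = {
        'api_key': '...',
        'engine': 'youtube',
        'device': 'desktop',
        'search_query': query,
        'no_cache': 'true'
    }
    results = YoutubeSearch(params).get_json()

    data = []
    if 'error' in results:
        print(results['error'])
    else:
        for result in results.get('video_results', []):
            data.append({
                'title': result.get('title'),
                'link': result.get('link'),
                'channel': result.get('channel', {}).get('name'),
            })
    return data

async def main():
    queries = ['burly', 'creator', 'doubtful']  # use the full 50-query list from above

    start_time = time.time()
    # asyncio.to_thread runs each blocking wrapper call in the default thread pool,
    # so the calls can overlap instead of running one after another
    results = await asyncio.gather(*(asyncio.to_thread(fetch_results_blocking, q) for q in queries))
    end_time = time.time()

    data = [item for sublist in results for item in sublist]
    print(json.dumps(data, indent=2, ensure_ascii=False))
    print(f'Script execution time: {end_time - start_time} seconds')

asyncio.run(main())

Note that the default thread pool caps the number of worker threads (min(32, os.cpu_count() + 4)), so 50 calls would still not all run at exactly the same time.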

Using async batch requests with Queue (a note on the polling loop follows the snippet):

from serpapi import YoutubeSearch
from queue import Queue
import re, json
import time

# 50 queries
queries = [
    'burly',
    'creator',
    'doubtful',
    'chance',
    'capable',
    'window',
    'dynamic',
    'train',
    'worry',
    'useless',
    'steady',
    'thoughtful',
    'matter',
    'rotten',
    'overflow',
    'object',
    'far-flung',
    'gabby',
    'tiresome',
    'scatter',
    'exclusive',
    'wealth',
    'yummy',
    'play',
    'saw',
    'spiteful',
    'perform',
    'busy',
    'hypnotic',
    'sniff',
    'early',
    'mindless',
    'airplane',
    'distribution',
    'ahead',
    'good',
    'squeeze',
    'ship',
    'excuse',
    'chubby',
    'smiling',
    'wide',
    'structure',
    'wrap',
    'point',
    'file',
    'sack',
    'slope',
    'therapeutic',
    'disturbed'
]

search_queue = Queue()

for query in queries:
    params = {
        'api_key': '...',
        'engine': 'youtube',
        'device': 'desktop',
        'search_query': query,
        'async': True,          # submit the search without waiting for it to finish
        'no_cache': 'true'
    }

    search = YoutubeSearch(params)       # create the search client
    results = search.get_dict()          # submits the search; returns metadata immediately because 'async' is set
    
    if 'error' in results:
        print(results['error'])
        break

    print(f"Add search to the queue with ID: {results['search_metadata']}")
    search_queue.put(results)

data = []

start_time = time.time()

while not search_queue.empty():
    result = search_queue.get()
    search_id = result['search_metadata']['id']

    print(f'Get search from archive: {search_id}')
    search_archived = search.get_search_archive(search_id)  # reuses the last YoutubeSearch instance; only its api_key is needed here
    
    print(f"Search ID: {search_id}, Status: {search_archived['search_metadata']['status']}")

    if re.search(r'Cached|Success', search_archived['search_metadata']['status']):
        for video_result in search_archived.get('video_results', []):
            data.append({
                'title': video_result.get('title'),
                'link': video_result.get('link'),
                'channel': video_result.get('channel', {}).get('name'),
            })
    else:
        print(f'Requeue search: {search_id}')
        search_queue.put(result)
        
print(json.dumps(data, indent=2))
print('All searches completed')

execution_time = time.time() - start_time
print(f'Script execution time: {execution_time} seconds') # ~24.80349826812744 seconds
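
One thing to note about the loop above: when a search is still processing, it is requeued and polled again immediately. Adding a short pause before the next poll keeps the archive endpoint from being hit in a tight loop. A sketch of that change, reusing the same search_queue, data, and search variables as above (the 1-second delay is an arbitrary choice):

import time

while not search_queue.empty():
    result = search_queue.get()
    search_id = result['search_metadata']['id']

    search_archived = search.get_search_archive(search_id)
    status = search_archived['search_metadata']['status']

    if status in ('Cached', 'Success'):
        for video_result in search_archived.get('video_results', []):
            data.append({
                'title': video_result.get('title'),
                'link': video_result.get('link'),
                'channel': video_result.get('channel', {}).get('name'),
            })
    else:
        # still processing: requeue and wait briefly before polling again
        search_queue.put(result)
        time.sleep(1)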