How can I make my Python requests code faster with asyncio and aiohttp?

Question:

def get_ship_position(ship_id):
    import requests

    url ="https://www.marinetraffic.com/en/vesselDetails/vesselInfo/shipid:{}".format(ship_id)
    

    headers = {
        "accept": "application/json",
        "accept-encoding": "gzip, deflate",
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }

    response = requests.get(url, headers=headers)
    response.raise_for_status()

    return response.json()


def main():
    from time import perf_counter
    start = perf_counter()
    i = 7550
    while (i <= 9999 ):
        i+=1
        try:

            data = get_ship_position(i)
            with open("marinetraffic.txt","a",encoding="utf-8") as bos:  
                print("{}t{}t{}t{}t{}t{}t{}t{}x{}t{}t{}t{}t{}t{}t{}".format(data["mmsi"], data["imo"],data["name"],data["nameAis"],data["type"],data["typeSpecific"],data["yearBuilt"],data["length"],data["breadth"],data["callsign"],data["country"],data["deadweight"],data["grossTonnage"],data["homePort"],data["status"]),file=bos) 
                print(i,"Yazdı")

        except Exception:
            print(i, "Error")
            with open("marinetraffichata.txt", "a", encoding="utf-8") as hata:
                print("Error", i, file=hata)
    stop = perf_counter()
    print("elapsed time:", stop - start, "seconds")



if __name__ == "__main__":
    import sys
    sys.exit(main())

This is progressing very slowly with the requests module; how can I make the code run faster? I've seen the aiohttp and asyncio modules and they are really fast. How can I adapt my own code?

Asked By: DeLee


Answers:

Using asyncio and aiohttp is certainly one way to do concurrent URL retrievals. But I wonder whether it is the best way, given that (1) you are already using requests and (2) you want to retrieve 2450 URLs, but not necessarily all at the same time.

By using a multithreading pool of size N, you would have N threads concurrently retrieving up to N URLs. By setting an "appropriate" value for N you can control the degree of concurrency. Performance may improve as you increase N, but beyond some point it will start to decrease. There is also the possibility that the website might think you are mounting a denial-of-service attack if you make too many concurrent requests.

In the code below I am using a value of 64 for N and creating a Session instance for doing the retrievals, which should also improve performance. I am using method multiprocessing.pool.ThreadPool.imap to process the returned data elements as they become available. This method returns an iterator that, when iterated, yields the next return value from your worker function, get_ship_position. However, I am explicitly calling the built-in next to iterate so that I can individually handle exceptions raised by get_ship_position. If I instead iterated with for data in pool.imap(worker, range(7551, 10_001)), then once an exception was raised by an invocation of get_ship_position I would not be able to continue iterating the subsequent results.
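
Before the full code, here is a minimal standalone sketch of that imap/next pattern (the flaky worker below is made up purely for illustration; it is not part of the scraper):

from multiprocessing.pool import ThreadPool

def flaky(n):
    # Made-up worker: fails on multiples of 3, succeeds otherwise.
    if n % 3 == 0:
        raise ValueError(n)
    return n * n

with ThreadPool(4) as pool:
    it = pool.imap(flaky, range(10))
    while True:
        try:
            print(next(it))       # next() re-raises the worker's exception, if any...
        except StopIteration:
            break
        except ValueError as e:
            print("failed:", e)   # ...but the iterator can keep being consumed

And here is the full version applied to your scraper: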

def get_ship_position(session, ship_id):

    url ="https://www.marinetraffic.com/en/vesselDetails/vesselInfo/shipid:{}".format(ship_id)



    response = session.get(url)
    response.raise_for_status()

    return response.json()


def main():
    from time import perf_counter
    import requests
    from multiprocessing.pool import ThreadPool
    from functools import partial

    start = perf_counter()

    with requests.Session() as session:
        headers = {
            "accept": "application/json",
            "accept-encoding": "gzip, deflate",
            "user-agent": "Mozilla/5.0",
            "x-requested-with": "XMLHttpRequest"
        }
        session.headers = headers

        with ThreadPool(64) as pool:
            worker = partial(get_ship_position, session)
            it = pool.imap(worker, range(7551, 10_001))
            i = 7550
            with open("marinetraffic.txt","a",encoding="utf-8") as f:
                while True:
                    i += 1
                    try:
                        data = next(it)
                        print("{}t{}t{}t{}t{}t{}t{}t{}x{}t{}t{}t{}t{}t{}t{}".format(data["mmsi"], data["imo"],data["name"],data["nameAis"],data["type"],data["typeSpecific"],data["yearBuilt"],data["length"],data["breadth"],data["callsign"],data["country"],data["deadweight"],data["grossTonnage"],data["homePort"],data["status"]),file=f)
                        print(i,"Yazdı")
                    except StopIteration:
                        break
                    except Exception:
                        print(i,"Hata")
                        print("Hata",i,file=f)

    stop = perf_counter()
    print("çalışılan süre:", stop - start,"saniye")

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
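
One optional tweak, not part of the original answer: a requests Session keeps at most 10 connections per host in its pool by default, so with 64 threads you may see "Connection pool is full, discarding connection" warnings. A sketch of mounting a larger HTTPAdapter (whether this measurably helps here is an assumption you would want to benchmark):

import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
# Size the pool to match ThreadPool(64) so every thread can reuse a connection.
adapter = HTTPAdapter(pool_connections=64, pool_maxsize=64)
session.mount("https://", adapter)
session.mount("http://", adapter)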

Using asyncio and aiohttp

The following code uses asyncio and aiohttp. A semaphore set to 64 controls the number of coroutines that can run concurrently, so you can limit how many concurrent GET requests are made. Again, this number can be adjusted to see how performance varies.
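
As a standalone illustration of the semaphore pattern (a toy demo, not part of the original answer), at most 2 of the 6 coroutines below can be past the async with line at any moment:

import asyncio

async def work(sem, n):
    async with sem:                # at most 2 coroutines hold the semaphore at once
        await asyncio.sleep(0.1)   # stand-in for a network request
        return n

async def demo():
    sem = asyncio.Semaphore(2)
    print(await asyncio.gather(*(work(sem, n) for n in range(6))))

asyncio.run(demo())

The full version applied to the scraper: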

import asyncio

async def get_ship_position(session, ship_id):

    url ="https://www.marinetraffic.com/en/vesselDetails/vesselInfo/shipid:{}".format(ship_id)
    async with session.get(url) as response:
        status = response.status
        if status != 200:
            raise Exception(f'Bad status: {status}')
        return await response.json()

async def bounded_fetch(sem, session, ship_id):
    async with sem:
        result = await get_ship_position(session, ship_id)
        return result

async def main():
    from time import perf_counter
    import aiohttp

    start = perf_counter()

    headers = {
        "accept": "application/json",
        "accept-encoding": "gzip, deflate",
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        sem = asyncio.Semaphore(64)
        responses = await asyncio.gather(*(bounded_fetch(sem, session, i) for i in range(7551, 10_001)), return_exceptions=True)
        with open("marinetraffic.txt","a",encoding="utf-8") as f:
            for i, data in enumerate(responses, start=7551):
                if isinstance(data, Exception):
                    print(i, "Error")
                    print("Error", i, file=f)
                else:
                    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}".format(data["mmsi"], data["imo"], data["name"], data["nameAis"], data["type"], data["typeSpecific"], data["yearBuilt"], data["length"], data["breadth"], data["callsign"], data["country"], data["deadweight"], data["grossTonnage"], data["homePort"], data["status"]), file=f)
                    print(i, "Wrote")

    stop = perf_counter()
    print("çalışılan süre:", stop - start,"saniye")

    return 0


if __name__ == "__main__":
    import sys
    rc = asyncio.run(main())  # modern replacement for get_event_loop().run_until_complete()
    sys.exit(rc)

Note

With either version, successive runs can produce widely different run times.

Update

If you want to write results to the output file as they are returned, instead of after all coroutines have completed, then try:

    ... # code omitted
    import aiofiles

    async with aiohttp.ClientSession(headers=headers) as session:
        sem = asyncio.Semaphore(64)
        tasks = [asyncio.create_task(bounded_fetch(sem, session, i)) for i in range(7551, 10_001)]
        async with aiofiles.open("marinetraffic.txt", "w", encoding="utf-8") as f:
            for i, task in enumerate(tasks, start=7551):
                try:
                    data = await task
                    record = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(data["mmsi"], data["imo"], data["name"], data["nameAis"], data["type"], data["typeSpecific"], data["yearBuilt"], data["length"], data["breadth"], data["callsign"], data["country"], data["deadweight"], data["grossTonnage"], data["homePort"], data["status"])
                    await f.write(record)
                    print(i, "Wrote")
                except Exception:
                    print(i, "Error")
                    # f is an aiofiles handle, so write errors with await f.write(), not print(file=f)
                    await f.write("Error {}\n".format(i))
    ... # code omitted
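
Note that aiofiles is a third-party package, not part of the standard library; install it with pip install aiofiles.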
Answered By: Booboo