Trying to fake and rotate user agents

Question:

I am trying to fake user agents as well as rotate them in Python.
I found a tutorial online about how to do this with Scrapy using the scrapy-useragents package.
I scrape the webpage https://www.whatsmyua.info/ in order to check whether the reported user agent differs from my own and whether it rotates. It is different from my actual user agent, but it does not rotate: the same user agent is returned each time, and I cannot figure out what is going wrong.

settings.py

BOT_NAME = 'scrapy_javascript'

SPIDER_MODULES = ['scrapy_javascript.spiders']
NEWSPIDER_MODULE = 'scrapy_javascript.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_javascript (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# -----------------------------------------------------------------------------
# USER AGENT
# -----------------------------------------------------------------------------

# note: assigning DOWNLOADER_MIDDLEWARES a second time replaces the dict
# defined above, so the Splash middlewares are silently dropped
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': 500,
}


USER_AGENTS = [
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/57.0.2987.110 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/61.0.3163.79 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) '
     'Gecko/20100101 '
     'Firefox/55.0'),  # firefox
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/61.0.3163.91 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/62.0.3202.89 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/63.0.3239.108 '
     'Safari/537.36'),  # chrome
]

SPLASH_URL = 'http://199.89.192.74:8050'


DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
Asked By: Tim


Answers:

I figured it out by creating a csv file with all my URLs, each paired with an IP and a user agent, so that every time I access a webpage I use that row's IP and user agent. Then I had to override the splash_url in my spider so that my splash_url would be equal to the proxy I am using at that moment.

SplashSpider.py

import csv
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import GameItem

# process the csv file so the url + ip address + useragent pairs are the same as defined in the file
# returns a list of dictionaries, example:
# [ {'url': 'http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan',
#    'ip': 'http://204.152.114.244:8050',
#    'ua': "Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11"},
#    ...
# ]
def process_csv(csv_file):
    data = []
    reader = csv.reader(csv_file)
    next(reader) # skip the header row
    for fields in reader:
        if fields[0] != "":
            url = fields[0]
        else:
            continue # skip the whole row if the url column is empty
        if fields[1] != "":
            ip = "http://" + fields[1] + ":8050" # adding http and port because this is the needed scheme
        if fields[2] != "":
            useragent = fields[2]
        # note: if the ip or user agent column is blank, the value from the
        # previous row is reused, so the first data row must supply both
        data.append({"url": url, "ip": ip, "ua": useragent})
    return data


class MySpider(Spider):
    name = 'splash_spider'  # Name of Spider

    # notice that we don't need to define start_urls
    # just make sure to get all the urls you want to scrape inside start_requests function

    # getting all the url + ip address + useragent pairs then request them
    def start_requests(self):

        # get the file path of the csv file that contains the pairs from the settings.py
        with open(self.settings["PROXY_CSV_FILE"], mode="r") as csv_file:
           # requests is a list of dictionaries like this -> {url: str, ua: str, ip: str}
            requests = process_csv(csv_file)

        for req in requests:
            # no need to create custom middlewares
            # just pass useragent using the headers param, and pass proxy using the meta param

            yield SplashRequest(url=req["url"], callback=self.parse, args={"wait": 3},
                    headers={"User-Agent": req["ua"]},
                    splash_url=req["ip"],
                    )
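
For reference, a proxies.csv matching what process_csv expects might look like the lines below (values borrowed from the example in the comments above; the header names are illustrative, since the code indexes columns by position, and a blank ip or user agent cell reuses the previous row's value):

url,ip,user_agent
http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan,204.152.114.244,Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11
http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan,,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36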

settings.py

BOT_NAME = 'scrapy_javascript'

SPIDER_MODULES = ['scrapy_javascript.spiders']
NEWSPIDER_MODULE = 'scrapy_javascript.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# The path of the csv file that contains the pairs
PROXY_CSV_FILE = "proxies.csv"

DOWNLOADER_MIDDLEWARES = {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

#SPLASH_URL = 'http://127.0.0.1:8050'

#SPLASH_URL = 'http://localhost:8050'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'



# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 16

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 60
Answered By: Tim

Here you can find an API returning the most common user agents as JSON:
http://51.158.74.109/useragents/?format=json
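
A minimal sketch of consuming that endpoint with requests (assuming it is still reachable; the payload shape is not shown here, so inspect the JSON before depending on specific keys):

import requests

# hedged sketch: pull the most common user agents as JSON and inspect them
resp = requests.get("http://51.158.74.109/useragents/?format=json", timeout=10)
resp.raise_for_status()
print(resp.json())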

I have used this tool, which keeps your list of user agents updated with the most recent and most widely used ones: https://pypi.org/project/shadow-useragent/

     from shadow_useragent import ShadowUserAgent
     shadow_useragent = ShadowUserAgent()

     print(shadow_useragent.firefox)
     # Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0
     print(shadow_useragent.chrome)
     # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
     print(shadow_useragent.safari)
     # Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15
     print(shadow_useragent.edge)
     # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134
     print(shadow_useragent.ie)
     # Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
     print(shadow_useragent.android)
     # Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30
     print(shadow_useragent.ipad)
     # Mozilla/5.0 (iPad; CPU OS 12_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Mobile/15E148 Safari/604.1
     # random, weighted by real-world browser usage statistics
     print(shadow_useragent.random)
     # Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0

     # if you want to exclude mobiles (some websites will display different pages)
     print(shadow_useragent.random_nomobile)
     # Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36
Answered By: SimonR

Check out https://www.useragents.me/. It is an auto-updating list of the latest user agents, available in JSON format (because it auto-updates, the data shouldn't go stale even if the site is neglected).
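
A minimal sketch of pulling that list live is below; note that the /api path and the "data"/"ua" keys are assumptions here, so verify the current endpoint and schema on the site before relying on them:

import requests

# hedged sketch: fetch the live user-agent list from useragents.me
# NOTE: the /api path and the 'data'/'ua' keys are assumptions -- verify on the site
resp = requests.get("https://www.useragents.me/api", timeout=10)
resp.raise_for_status()
uas = [entry["ua"] for entry in resp.json().get("data", [])]
print(uas[:3])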

Conveniently, they even offer a Python user-agent rotator function directly on the site (with user agents current as of November 4, 2022):

import random

def random_ua(k=1):
    # returns k random user agents from the latest list of user agent strings,
    # weighted according to observed prevalence
    ua_pct = {"ua": {"0": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", "1": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0", "2": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", "3": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0", "4": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", "5": "Mozilla/5.0 (X11; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0", "6": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", "7": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36", "8": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", "9": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15", "10": "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0", "11": "Mozilla/5.0 (Windows NT 10.0; rv:105.0) Gecko/20100101 Firefox/105.0", "12": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:105.0) Gecko/20100101 Firefox/105.0", "13": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0", "14": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"}, "pct": {"0": 28.8, "1": 13.28, "2": 10.98, "3": 8.55, "4": 6.25, "5": 5.56, "6": 4.53, "7": 4.27, "8": 3.57, "9": 2.93, "10": 2.99, "11": 2.55, "12": 2.44, "13": 1.7, "14": 1.59}}
    return random.choices(list(ua_pct['ua'].values()), list(ua_pct['pct'].values()), k=k)

# usage
random_ua() # optional k = number of useragents to return
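
A short usage sketch sending the rotated agent with plain requests (building on the random_ua function above; whatsmyua.info is the same check page used in the question):

import requests

# random_ua() returns a list even for k=1, so take the first element
headers = {"User-Agent": random_ua()[0]}
resp = requests.get("https://www.whatsmyua.info/", headers=headers)
print(resp.status_code)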
Answered By: user20187835

One approach might be to use the fake_useragent package and rotate the user agent after each request.

Assuming a virtualenv was created and activated, install the required dependency with pip install fake_useragent.

The following code example shows how to achieve it (making 3 calls and logging to the console to make sure the agent is rotated as expected):

import logging
import requests
from fake_useragent import UserAgent


def configure_logger(logger_name: str):
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger


class FakeWebScraper:
    """A simple fake web scraper class"""

    def __init__(self, logger=None):
        """Initialize the UserAgent object to be rotated later on during subsequent calls"""
        self.headers = {}
        self.logger = logger if logger else configure_logger(__name__)
        self.ua = UserAgent()

    def get_html(self, url: str) -> str:
        """Retrieve the HTML content of a web page using a GET request"""
        self.headers["User-Agent"] = user_agent = self.ua.random
        try:
            response = requests.get(url, headers=self.headers)
        except requests.ConnectionError as ex:
            # log on the instance logger and re-raise: there is no response to return
            self.logger.error(ex)
            raise
        self.logger.info(
            f"User-Agent: {user_agent}, Status: {response.status_code}"
        )
        return response.text

# sample code to verify UserAgent is rotated
if __name__ == "__main__":
    logger = configure_logger(__name__)
    url = "https://www.google.com"

    scraper = FakeWebScraper(logger)

    num_of_web_calls = 3
    for _ in range(num_of_web_calls):
        html_content = scraper.get_html(url)

The result:

2023-01-16 19:19:10,670 - __main__ - INFO - User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0, Status: 200
2023-01-16 19:19:10,850 - __main__ - INFO - User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.2) Gecko/20060406 Firefox/1.5.0.2, Status: 200
2023-01-16 19:19:11,020 - __main__ - INFO - User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1) Opera 7.52  [en], Status: 200
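
To get the same per-request rotation inside Scrapy (the setup from the original question), a minimal sketch of a custom downloader middleware follows; RotateUserAgentMiddleware and its module path are hypothetical names, not part of fake_useragent:

# middlewares.py -- hypothetical rotating user-agent middleware
from fake_useragent import UserAgent

class RotateUserAgentMiddleware:
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # stamp a fresh random User-Agent on every outgoing request
        request.headers["User-Agent"] = self.ua.random

# settings.py -- disable the built-in middleware and enable the custom one
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
#     'scrapy_javascript.middlewares.RotateUserAgentMiddleware': 400,
# }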
Answered By: Patryk