Scrape Tweet replies with Python and Selenium
Question:
I’m trying to scrape replies to public Tweets using Python.
I have the code below, which gets all replies displayed on the screen, but I am having trouble getting the rest of the replies that need scrolling.
The code works fine without the scroll loop, but once it is implemented, it just retrieves blank results.
Can someone please help me figure out why?
Tweet to be used as an example: https://twitter.com/BBCWorld/status/1535676092450840578
Code with scrolling loop:
# Question's script: scrape replies to one tweet, scrolling for more.
# (Paths below lost their backslashes when pasted; restored best-effort.)
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import time

driver = webdriver.Chrome(executable_path=r"C:\User\AppData\Local\SeleniumBasic\chromedriver")  # find chrome driver in pc folder
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")  # URL used as example
time.sleep(60)  # crude wait for the page and first replies to render

all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

tweets = []
# NOTE(review): this is the bug the question asks about -- the loop has no
# exit condition and `all_tweets` is never refreshed after scrolling, so it
# keeps re-reading stale element handles (hence the blank results) and the
# DataFrame code below is unreachable.
while True:
    for item in all_tweets[1:]:  # skip the original tweet, already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except Exception:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except Exception:
            replying_to = '[empty]'
        print(replying_to)

        tweets.append([date, replying_to, text])

    time.sleep(3)
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(3)

df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv(r'C:\User\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a csv file in the downloads folder
print(df)
—UPDATE—
Based on the suggestion below, I’ve updated the code as follows, but I am only getting the first replies (i.e., the ones after scrolling are still missing):
# Question's updated script: re-scrape after every scroll until the page
# height stops growing. (Backslash-stripped paths restored best-effort.)
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import time

driver = webdriver.Chrome(executable_path=r"C:\Users\AppData\Local\SeleniumBasic\chromedriver")
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")
time.sleep(60)  # crude wait for the page and first replies to render

tweets = []
result = False
old_height = driver.execute_script("return document.body.scrollHeight")
# set initial all_tweets to start loop
all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

while not result:
    for item in all_tweets[1:]:  # skip the original tweet, already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except Exception:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except Exception:
            replying_to = '[empty]'
        print(replying_to)

        # Append new tweet replies to the tweets array
        tweets.append([date, replying_to, text])

    # scroll down the page
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    # NOTE(review): the height is read back immediately with no wait, so
    # lazy-loaded replies never arrive before the exit check -- this is why
    # only the first batch of replies is captured.
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == old_height:
        result = True
    old_height = new_height
    # update all_tweets to keep loop
    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv(r'C:\Users\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a csv file in the downloads folder
print(df)
Answers:
You need to scroll to bottom first, then retrieve what you need.
Buttons that load new replies need to be clicked. Remove duplicates in the final list. It is also very important to let the page load completely before resuming the code run.
# Answer's script: scroll, click the "show more replies" buttons, then
# deduplicate. (Backslash-stripped path restored best-effort.)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import time
import itertools

driver = webdriver.Chrome(executable_path=r"C:\Users\AppData\Local\SeleniumBasic\chromedriver")
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")
time.sleep(5)

tweets = []
result = False
old_height = driver.execute_script("return document.body.scrollHeight")
# set initial all_tweets to start loop
all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

while not result:
    for item in all_tweets[1:]:  # skip the original tweet, already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except Exception:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except Exception:
            replying_to = '[empty]'
        print(replying_to)

        # Append new tweet replies to the tweets array
        tweets.append([date, replying_to, text])

    # scroll down the page
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)

    # Click a "show more replies" button if one is present; there are two
    # kinds of buttons. find_element_by_css_selector was removed in
    # Selenium 4 -- use find_element(By.CSS_SELECTOR, ...) instead.
    try:
        try:
            button = driver.find_element(By.CSS_SELECTOR, "div.css-901oao.r-1cvl2hr.r-37j5jr.r-a023e6.r-16dba41.r-rjixqe.r-bcqeeo.r-q4m81j.r-qvutc0")
        except Exception:
            button = driver.find_element(By.CSS_SELECTOR, "div.css-1dbjc4n.r-1ndi9ce")  # there are two kinds of buttons
        ActionChains(driver).move_to_element(button).click(button).perform()
        time.sleep(2)
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(2)
    except Exception:
        pass  # no button on screen -- nothing to click

    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == old_height:
        result = True
    old_height = new_height
    # update all_tweets to keep loop
    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

# list.sort() sorts in place and returns None, so the original
# `tweets = tweets.sort()` destroyed the data. groupby() also yields
# (key, group) pairs -- keep only the keys to drop adjacent duplicates.
tweets.sort()
tweets = [key for key, _ in itertools.groupby(tweets)]  # remove duplicates from final list
I am happy to share that I finally found a solution to the above query!
It’s not perfect (as it doesn’t load hidden replies and only scrapes the main reply, i.e., it doesn’t consider the sub-replies), but it was enough for my current needs.
So, feel free to use it, but keep these limitations in mind 🙂
# Final solution: scroll to the bottom first, THEN scrape every reply once.
# Do imports
import numpy as np
import pandas as pd
import time
import selenium
from selenium import webdriver  # was missing in the original; webdriver is used below
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Set driver and initial array
driver = webdriver.Chrome(executable_path=r"C:\Users\your_user\AppData\Local\SeleniumBasic\chromedriver")  # change parameters to your user and folder structure
driver.get("the url you want to scrappe")  # input the url you wanna scrappe here
time.sleep(10)  # change according to your pc and internet connection

tweets = []

# Get scroll height after first time page load
last_height = driver.execute_script("return document.body.scrollHeight")

# Phase 1: scroll until the page height stops growing, i.e. every reply
# that loads on scroll has been rendered.
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(6)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Phase 2: collect every reply now that the full thread is rendered.
all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')
for item in all_tweets[1:]:  # skip the original tweet, already scraped
    print('--- date ---')
    try:
        date = item.find_element(By.XPATH, './/time').text
    except Exception:
        date = '[empty]'
    print(date)

    print('--- text ---')
    try:
        text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
    except Exception:
        text = '[empty]'
    print(text)

    print('--- replying_to ---')
    try:
        replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
    except Exception:
        replying_to = '[empty]'
    print(replying_to)

    # The original appended an undefined `username` plus four fields for a
    # three-column DataFrame; append the three fields in column order.
    tweets.append([replying_to, text, date])

df = pd.DataFrame(tweets, columns=['Replying to', 'Tweet', 'Date of Tweet'])
df.to_csv(r'C:\Users\your_user\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a csv file in the downloads folder, change it to your structure and desired folder
print(df)
Did you find a solution for the problem above, so that it scrapes all replies? It would be awesome if you could share it.
I’m trying to scrape replies to public Tweets using Python.
I have the code below, which gets all replies displayed on the screen, but I am having trouble getting the rest of the replies that need scrolling.
The code works fine without the scroll loop, but once it is implemented, it just retrieves blank results.
Can someone please help me figure out why?
Tweet to be used as an example: https://twitter.com/BBCWorld/status/1535676092450840578
Code with scrolling loop:
# Question's script: scrape replies to one tweet, scrolling for more.
# (Paths below lost their backslashes when pasted; restored best-effort.)
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import time

driver = webdriver.Chrome(executable_path=r"C:\User\AppData\Local\SeleniumBasic\chromedriver")  # find chrome driver in pc folder
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")  # URL used as example
time.sleep(60)  # crude wait for the page and first replies to render

all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

tweets = []
# NOTE(review): this is the bug the question asks about -- the loop has no
# exit condition and `all_tweets` is never refreshed after scrolling, so it
# keeps re-reading stale element handles (hence the blank results) and the
# DataFrame code below is unreachable.
while True:
    for item in all_tweets[1:]:  # skip the original tweet, already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except Exception:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except Exception:
            replying_to = '[empty]'
        print(replying_to)

        tweets.append([date, replying_to, text])

    time.sleep(3)
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(3)

df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv(r'C:\User\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a csv file in the downloads folder
print(df)
—UPDATE—
Based on the suggestion below, I’ve updated the code as follows, but I am only getting the first replies (i.e., the ones after scrolling are still missing):
# Question's updated script: re-scrape after every scroll until the page
# height stops growing. (Backslash-stripped paths restored best-effort.)
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import time

driver = webdriver.Chrome(executable_path=r"C:\Users\AppData\Local\SeleniumBasic\chromedriver")
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")
time.sleep(60)  # crude wait for the page and first replies to render

tweets = []
result = False
old_height = driver.execute_script("return document.body.scrollHeight")
# set initial all_tweets to start loop
all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

while not result:
    for item in all_tweets[1:]:  # skip the original tweet, already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except Exception:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except Exception:
            replying_to = '[empty]'
        print(replying_to)

        # Append new tweet replies to the tweets array
        tweets.append([date, replying_to, text])

    # scroll down the page
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    # NOTE(review): the height is read back immediately with no wait, so
    # lazy-loaded replies never arrive before the exit check -- this is why
    # only the first batch of replies is captured.
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == old_height:
        result = True
    old_height = new_height
    # update all_tweets to keep loop
    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv(r'C:\Users\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a csv file in the downloads folder
print(df)
You need to scroll to bottom first, then retrieve what you need.
Buttons that load new replies need to be clicked. Remove duplicates in the final list. It is also very important to let the page load completely before resuming the code run.
# Answer's script: scroll, click the "show more replies" buttons, then
# deduplicate. (Backslash-stripped path restored best-effort.)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import time
import itertools

driver = webdriver.Chrome(executable_path=r"C:\Users\AppData\Local\SeleniumBasic\chromedriver")
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")
time.sleep(5)

tweets = []
result = False
old_height = driver.execute_script("return document.body.scrollHeight")
# set initial all_tweets to start loop
all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

while not result:
    for item in all_tweets[1:]:  # skip the original tweet, already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except Exception:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except Exception:
            replying_to = '[empty]'
        print(replying_to)

        # Append new tweet replies to the tweets array
        tweets.append([date, replying_to, text])

    # scroll down the page
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)

    # Click a "show more replies" button if one is present; there are two
    # kinds of buttons. find_element_by_css_selector was removed in
    # Selenium 4 -- use find_element(By.CSS_SELECTOR, ...) instead.
    try:
        try:
            button = driver.find_element(By.CSS_SELECTOR, "div.css-901oao.r-1cvl2hr.r-37j5jr.r-a023e6.r-16dba41.r-rjixqe.r-bcqeeo.r-q4m81j.r-qvutc0")
        except Exception:
            button = driver.find_element(By.CSS_SELECTOR, "div.css-1dbjc4n.r-1ndi9ce")  # there are two kinds of buttons
        ActionChains(driver).move_to_element(button).click(button).perform()
        time.sleep(2)
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(2)
    except Exception:
        pass  # no button on screen -- nothing to click

    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == old_height:
        result = True
    old_height = new_height
    # update all_tweets to keep loop
    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

# list.sort() sorts in place and returns None, so the original
# `tweets = tweets.sort()` destroyed the data. groupby() also yields
# (key, group) pairs -- keep only the keys to drop adjacent duplicates.
tweets.sort()
tweets = [key for key, _ in itertools.groupby(tweets)]  # remove duplicates from final list
I am happy to share that I finally found a solution to the above query!
It’s not perfect (as it doesn’t load hidden replies and only scrapes the main reply, i.e., it doesn’t consider the sub-replies), but it was enough for my current needs.
So, feel free to use it, but keep these limitations in mind 🙂
# Final solution: scroll to the bottom first, THEN scrape every reply once.
# Do imports
import numpy as np
import pandas as pd
import time
import selenium
from selenium import webdriver  # was missing in the original; webdriver is used below
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Set driver and initial array
driver = webdriver.Chrome(executable_path=r"C:\Users\your_user\AppData\Local\SeleniumBasic\chromedriver")  # change parameters to your user and folder structure
driver.get("the url you want to scrappe")  # input the url you wanna scrappe here
time.sleep(10)  # change according to your pc and internet connection

tweets = []

# Get scroll height after first time page load
last_height = driver.execute_script("return document.body.scrollHeight")

# Phase 1: scroll until the page height stops growing, i.e. every reply
# that loads on scroll has been rendered.
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(6)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Phase 2: collect every reply now that the full thread is rendered.
all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')
for item in all_tweets[1:]:  # skip the original tweet, already scraped
    print('--- date ---')
    try:
        date = item.find_element(By.XPATH, './/time').text
    except Exception:
        date = '[empty]'
    print(date)

    print('--- text ---')
    try:
        text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
    except Exception:
        text = '[empty]'
    print(text)

    print('--- replying_to ---')
    try:
        replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
    except Exception:
        replying_to = '[empty]'
    print(replying_to)

    # The original appended an undefined `username` plus four fields for a
    # three-column DataFrame; append the three fields in column order.
    tweets.append([replying_to, text, date])

df = pd.DataFrame(tweets, columns=['Replying to', 'Tweet', 'Date of Tweet'])
df.to_csv(r'C:\Users\your_user\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a csv file in the downloads folder, change it to your structure and desired folder
print(df)
Did you find a solution for the problem above, so that it scrapes all replies? It would be awesome if you could share it.