How to get total number of posts of a subreddit using Python?

Question:

I am working on a project where I have to scrape a subreddit using PRAW. However, I have to set a limit so that it scrapes only that many posts. For example, if I want to scrape the gaming subreddit (https://www.reddit.com/r/gaming/), I have to give a limit of 100 so that it scrapes the first 100 posts. Instead, I want to first get the total number of posts in the gaming subreddit and then use that value as the limit to extract all the posts. I have searched the internet and found the Pushshift API, but I don’t know how to use it for this. Any help is appreciated!

Following code:

import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from psaw import PushshiftAPI

# Load variables from the .env file located by find_dotenv()
load_dotenv(find_dotenv())

# Empty frame that receives one row per scraped submission
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])

# Reddit client configured from environment variables
# (only client id, client secret and user agent are supplied)
reddit_read_only = praw.Reddit(
    client_id=os.getenv("client_id"),
    client_secret=os.getenv("client_secret"),
    user_agent=os.getenv("user_agent"),
)

def main(name, value):
    """Collect hot posts of a subreddit into the module-level ``df``.

    Prints the subreddit's ``created`` attribute, then stores one row per
    submission: its title, its comment count and the list of comment bodies
    (or ``['No comments']`` when the count is zero).

    Args:
        name: Subreddit name, e.g. ``'gaming'``.
        value: Number of posts to request; also compared against the row
            counter, so it has to be a number.

    Returns:
        str: ``'Reddit_web_scrap_<name>'``, the base name meant for the CSV.
    """
    row = 0
    subreddit = reddit_read_only.subreddit(name)
    print(subreddit.created)
    # NOTE(review): the listing is requested again whenever fewer than
    # `value` rows have been written after a full pass over it.
    while row < value:
        for post in subreddit.hot(limit=value):
            post.comments.replace_more(limit=value * 30)
            if post.num_comments != 0:
                bodies = [comment.body for comment in post.comments.list()]
            else:
                # Placeholder stored for posts without any comments
                bodies = ['No comments']
            df.loc[row] = [post.title, post.num_comments, bodies]
            row += 1
    # Saving is switched off here: df.to_csv(name + '.csv', index=False)
    return 'Reddit_web_scrap_' + str(name)

if __name__ == "__main__":
    # Opening banner followed by one blank line
    print(
        '#####################################################################',
        '############### Reddit Web Scrapping Started ########################',
        '#####################################################################',
        '',
        sep='\n',
    )
    name = main('gaming', 50)
    # Result message framed by blank lines, then the closing banner
    print(
        '',
        f'Created {name}.csv file!',
        '',
        '#####################################################################',
        '################# Reddit Web Scrapping Ended ########################',
        '#####################################################################',
        sep='\n',
    )

I have set the limit to 50, which scrapes the first 50 posts. But I want to scrape all the posts that are available in gaming. If I set limit = "None", it throws an error:

TypeError: '<' not supported between instances of 'int' and 'str'

And this is logical as well. So, I guess I won’t be able to use limit = "None".

Asked By: Hardik_Zalavadiya

||

Answers:

The ‘limit’ argument is optional. However, setting the ‘limit’ argument to ‘None’ is also an option. If possible, please provide the code you already have.

Answered By: Mats

I have created a function total_posts(), with the help of the Pushshift API, that gives me the total number of posts available for a particular subreddit.

#Importing Dependencies
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from pmaw import PushshiftAPI

# Load variables from the .env file located by find_dotenv()
load_dotenv(find_dotenv())

# Empty frame that receives one row per scraped submission
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])

# Reddit client configured from environment variables
# (only client id, client secret and user agent are supplied)
reddit_read_only = praw.Reddit(
    client_id=os.getenv("client_id"),
    client_secret=os.getenv("client_secret"),
    user_agent=os.getenv("user_agent"),
)

def total_posts(name):
    """Count the submissions the Pushshift search returns for a subreddit.

    Args:
        name: Subreddit name, e.g. ``'gaming'``.

    Returns:
        int: Number of submissions yielded by the search (restricted by the
        ``score=">=0"`` filter passed to the API).
    """
    print("Calculating total number of posts")
    print()
    api = PushshiftAPI()
    # Query the subreddit that was asked for. This was hard-coded to
    # 'ChatGPT', so every subreddit reported ChatGPT's post count.
    api_request_generator = api.search_submissions(subreddit=name, score=">=0")
    # Only the count is needed, so tally the results instead of loading
    # every submission into a DataFrame first.
    total = sum(1 for _ in api_request_generator)
    print("Total number of posts in subreddit {} are {}".format(name, total))

    return total

def main(name, value):
    """Scrape hot posts of a subreddit, with their comments, into a CSV file.

    Each submission becomes one row of the module-level ``df`` (title, comment
    count, list of comment bodies, or ``['No comments']`` when the count is
    zero). The frame is printed and written to
    ``Reddit_web_scrap_<name>.csv``.

    Args:
        name: Subreddit name, e.g. ``'gaming'``.
        value: Maximum number of posts to request. ``None`` is passed through
            to PRAW, which then returns as many posts as the listing offers.

    NOTE(review): Reddit listings are reported to stop at roughly 1000 items,
    so fewer than ``value`` rows may be produced - confirm against the PRAW
    documentation.
    """
    print('Creating dataframe')
    print()
    subreddit = reddit_read_only.subreddit(name)
    # replace_more() takes None to expand every "load more comments" stub
    more_limit = None if value is None else value * 30
    # Walk the listing exactly once. The former outer `while i < value` loop
    # requested the listing again whenever it yielded fewer than `value`
    # posts, which duplicated rows and never ended on an empty listing.
    for row, submission in enumerate(subreddit.hot(limit=value)):
        submission.comments.replace_more(limit=more_limit)
        if submission.num_comments != 0:
            comments = [comment.body for comment in submission.comments.list()]
        else:
            # Placeholder stored for posts without any comments
            comments = ['No comments']
        df.loc[row] = [submission.title, submission.num_comments, comments]
    print(df)
    name = 'Reddit_web_scrap_' + str(name)  # base name of the output file
    df.to_csv(name + '.csv', index=False)

if __name__ == "__main__":

    # Subreddit to count and scrape
    subreddit_name = 'gaming'

    print('#####################################################################')
    print('#### Reddit Web Scrapping Started for {}'.format(subreddit_name) + '####')
    print('#####################################################################')
    print()
    # The Pushshift count is used as the post limit for the scrape
    posts_number = total_posts(subreddit_name)
    print()
    main(subreddit_name, posts_number)
    print()
    # main() saves to 'Reddit_web_scrap_<name>.csv'; report that file name
    # instead of the non-existent '<name>.csv'.
    print('Created Reddit_web_scrap_{}.csv file!'.format(subreddit_name))
    print()
    print('#####################################################################')
    print('################# Reddit Web Scrapping Ended ########################')
    print('#####################################################################')
Answered By: SmitShah_19
Categories: questions Tags: , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.