pymysql: retrieving data in batches
Question:
I am using pymysql to interact with an SQL database; here is the code:
# Buffered DictCursor: execute() transfers the whole result set to the
# client, and fetchall() materializes every row in memory at once.
query = "SELECT * FROM my_table"
c = db.cursor(pymysql.cursors.DictCursor)
c.execute(query)
c.fetchall()
Now I am doing this for a database of 3,000 entries, which isn't a lot. I plan to use this on a database with 15 million entries, and I want to take the data out in a loop, as batches of 5,000 entries.
I wonder: do I need to do this using LIMIT and OFFSET in my query, or can I do it with a method in the pymysql module? Thank you.
Answers:
The way I'm doing this is like this:
# Fetch rows in batches of 5000 via fetchone().
#
# BUG in the original: cursor.fetchone() does NOT raise when the result
# set is exhausted -- it returns None (PEP 249).  The bare `except` never
# fired, so `not_done` never became False and the loop ran forever,
# appending None to `data`.  Test for None explicitly instead.
#
# NOTE: with a plain (buffered) DictCursor the server has already sent
# the whole result set to the client, so this batches *processing* only,
# not memory use -- see the SSCursor answer for true streaming.
query = "SELECT * FROM my_table"
c = db.cursor(pymysql.cursors.DictCursor)
c.execute(query)
batch_size = 5000
not_done = True
while not_done:
    data = []
    for _ in range(batch_size):
        row = c.fetchone()
        if row is None:        # end of result set
            not_done = False
            break
        data.append(row)
    if data:                   # skip the trailing empty batch
        do_something_with_data()
No need to use LIMIT or OFFSET.
For pymysql, all you need is pymysql.cursors.SSCursor. With this cursor, the program does not load all of the data into memory before you can process rows and write them to disk.
Sample code:
(MySQL credentials and imports are omitted)
TEST_ROW_SIZE = 200000

# SSCursor is a server-side (unbuffered) cursor: rows stream from the
# server one at a time instead of being buffered in client memory.
connection = pymysql.connect(**config, cursorclass=pymysql.cursors.SSCursor)
with connection:
    with connection.cursor() as cursor:
        sql = f"select * from {table}"
        cursor.execute(sql)
        n = 0
        # newline='' is required by the csv module so line terminators
        # are not translated (avoids blank lines on Windows).
        with open('./pymysql-1row-write.csv', 'w', newline='') as out:
            csv_out = csv.writer(out)
            for row in cursor:
                # do something with the row
                csv_out.writerow(row)
                n += 1
                if n % 10000 == 0:
                    print(f"{n} rows dumped")
                if n % TEST_ROW_SIZE == 0:
                    print("Test Done!")
                    exit(0)  # success: exit status 0 (original used 1)
Use fetchmany
TEST_ROW_SIZE = 200000
FETCH_ROW_SIZE = 10000  # rows per fetchmany() call (typo fixed: was FETACH_ROW_SIZE)

# Server-side cursor + fetchmany(): stream the result set in batches.
connection = pymysql.connect(**config, cursorclass=pymysql.cursors.SSCursor)
with connection:
    with connection.cursor() as cursor:
        sql = f"select * from {table}"
        cursor.execute(sql)
        n = 0
        # newline='' is required by the csv module so line terminators
        # are not translated (avoids blank lines on Windows).
        with open('./pymysql-fetchmany-write.csv', 'w', newline='') as out:
            csv_out = csv.writer(out)
            result = cursor.fetchmany(FETCH_ROW_SIZE)
            while result:
                # do something with the fetched rows
                csv_out.writerows(result)
                result_len = len(result)
                print(f"{result_len} rows dumped")
                n += result_len
                result = cursor.fetchmany(FETCH_ROW_SIZE)
                if n % TEST_ROW_SIZE == 0:
                    print("Test Done!")
                    exit(0)  # success: exit status 0 (original used 2)
            if n == 0:
                # Only report "No data" when the query truly returned no
                # rows (the original printed it after any normal finish).
                print("No data")
                exit(1)
If you are using MySQL's official Python connector, mysql-connector-python, it is "unbuffered" by default.
cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()  # unbuffered by default: rows stream from the server
# try the buffered one and see what happens :)
# cursor = cnx.cursor(buffered=True)
sql = f"select * from {table}"
cursor.execute(sql)
n = 0
# newline='' is required by the csv module so line terminators are not
# translated (avoids blank lines on Windows).
with open('./data-mysql-connector.csv', 'w', newline='') as out:
    # with open('./data-mysql-connector-buffered.csv', 'w', newline='') as out:
    csv_out = csv.writer(out)
    for row in cursor:
        # do something with the row
        csv_out.writerow(row)
        n += 1
        if n % 10000 == 0:
            print(f"{n} rows dumped")
mysqlclient also has an SSCursor, but I didn't try that connector.
SSCursor
A “server-side” cursor. Like Cursor but uses
CursorUseResultMixIn. Use only if you are dealing with potentially
large result sets
I am using pymysql to interact with an SQL database; here is the code:
# Buffered DictCursor: execute() transfers the whole result set to the
# client, and fetchall() materializes every row in memory at once.
query = "SELECT * FROM my_table"
c = db.cursor(pymysql.cursors.DictCursor)
c.execute(query)
c.fetchall()
Now I am doing this for a database of 3,000 entries, which isn't a lot. I plan to use this on a database with 15 million entries, and I want to take the data out in a loop, as batches of 5,000 entries.
I wonder: do I need to do this using LIMIT and OFFSET in my query, or can I do it with a method in the pymysql module? Thank you.
The way I'm doing this is like this:
# Fetch rows in batches of 5000 via fetchone().
#
# BUG in the original: cursor.fetchone() does NOT raise when the result
# set is exhausted -- it returns None (PEP 249).  The bare `except` never
# fired, so `not_done` never became False and the loop ran forever,
# appending None to `data`.  Test for None explicitly instead.
#
# NOTE: with a plain (buffered) DictCursor the server has already sent
# the whole result set to the client, so this batches *processing* only,
# not memory use -- see the SSCursor answer for true streaming.
query = "SELECT * FROM my_table"
c = db.cursor(pymysql.cursors.DictCursor)
c.execute(query)
batch_size = 5000
not_done = True
while not_done:
    data = []
    for _ in range(batch_size):
        row = c.fetchone()
        if row is None:        # end of result set
            not_done = False
            break
        data.append(row)
    if data:                   # skip the trailing empty batch
        do_something_with_data()
No need to use LIMIT or OFFSET.
For pymysql, all you need is pymysql.cursors.SSCursor. With this cursor, the program does not load all of the data into memory before you can process rows and write them to disk.
Sample code:
(MySQL credentials and imports are omitted)
TEST_ROW_SIZE = 200000

# SSCursor is a server-side (unbuffered) cursor: rows stream from the
# server one at a time instead of being buffered in client memory.
connection = pymysql.connect(**config, cursorclass=pymysql.cursors.SSCursor)
with connection:
    with connection.cursor() as cursor:
        sql = f"select * from {table}"
        cursor.execute(sql)
        n = 0
        # newline='' is required by the csv module so line terminators
        # are not translated (avoids blank lines on Windows).
        with open('./pymysql-1row-write.csv', 'w', newline='') as out:
            csv_out = csv.writer(out)
            for row in cursor:
                # do something with the row
                csv_out.writerow(row)
                n += 1
                if n % 10000 == 0:
                    print(f"{n} rows dumped")
                if n % TEST_ROW_SIZE == 0:
                    print("Test Done!")
                    exit(0)  # success: exit status 0 (original used 1)
Use fetchmany
TEST_ROW_SIZE = 200000
FETCH_ROW_SIZE = 10000  # rows per fetchmany() call (typo fixed: was FETACH_ROW_SIZE)

# Server-side cursor + fetchmany(): stream the result set in batches.
connection = pymysql.connect(**config, cursorclass=pymysql.cursors.SSCursor)
with connection:
    with connection.cursor() as cursor:
        sql = f"select * from {table}"
        cursor.execute(sql)
        n = 0
        # newline='' is required by the csv module so line terminators
        # are not translated (avoids blank lines on Windows).
        with open('./pymysql-fetchmany-write.csv', 'w', newline='') as out:
            csv_out = csv.writer(out)
            result = cursor.fetchmany(FETCH_ROW_SIZE)
            while result:
                # do something with the fetched rows
                csv_out.writerows(result)
                result_len = len(result)
                print(f"{result_len} rows dumped")
                n += result_len
                result = cursor.fetchmany(FETCH_ROW_SIZE)
                if n % TEST_ROW_SIZE == 0:
                    print("Test Done!")
                    exit(0)  # success: exit status 0 (original used 2)
            if n == 0:
                # Only report "No data" when the query truly returned no
                # rows (the original printed it after any normal finish).
                print("No data")
                exit(1)
If you are using MySQL's official Python connector, mysql-connector-python, it is "unbuffered" by default.
cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()  # unbuffered by default: rows stream from the server
# try the buffered one and see what happens :)
# cursor = cnx.cursor(buffered=True)
sql = f"select * from {table}"
cursor.execute(sql)
n = 0
# newline='' is required by the csv module so line terminators are not
# translated (avoids blank lines on Windows).
with open('./data-mysql-connector.csv', 'w', newline='') as out:
    # with open('./data-mysql-connector-buffered.csv', 'w', newline='') as out:
    csv_out = csv.writer(out)
    for row in cursor:
        # do something with the row
        csv_out.writerow(row)
        n += 1
        if n % 10000 == 0:
            print(f"{n} rows dumped")
mysqlclient also has an SSCursor, but I didn't try that connector.
SSCursor
A “server-side” cursor. Like Cursor but uses
CursorUseResultMixIn. Use only if you are dealing with potentially
large result sets