How to change column datatype when loading csv in postgresql
Question:
I have a big script; the result is that the data is stored in a dataframe and then in a csv file. Then the csv is opened and written to PostgreSQL. But there is a problem: the data type of one column is int4, and after opening the csv the column format is 'text'. I cannot change the data type in the database; the values must be stored exactly as int. Please tell me how to do this.
# Export the dataframe to CSV, then re-read it and insert row by row.
total.to_csv("C:/Users/.../total_type19.csv", index=False, sep=';')
conn5 = psycopg2.connect(dbname='', user='',
                         password='', host='', port='')
cursor5 = conn5.cursor()
with open("C:/Users/.../total_type19.csv", "r", encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=";")
    # Skip the header line — otherwise the column names themselves are
    # inserted as a data row, and 'test_id' (a header string) can never
    # be stored in an int4 column.
    next(reader)
    for row in reader:
        cursor5.execute(
            'INSERT INTO interaction_fillword (test_id,data_size,data_matrix, data_words_selection, data_colors, data_answers) VALUES(%s,%s, %s, %s, %s, %s)',
            # Cast test_id to int so psycopg2 sends it as an integer
            # and Postgres accepts it into the int4 column.
            (int(row[0]), row[1], row[2], row[3], row[4], row[5]))
conn5.commit()
The test_id column must be in int4 format
['312229', "['[{from:[3,4],to:[7,4],color:2},{from:[3,6],to:[10,6],color:3},{from:[5,8],to:[9,8],color:5},{from:[5,11],to:[10,11],color:6},{from:[1,0],to:[1,11],color:0},{from:[10,1],to:[10,6],color:4},{from:[3,0],to:[8,0],color:1}],']", '['["v","b","c","c","a","h","i","e","r","s","f","j"],["d","i","w","s","s","r","i","f","y","y","f","c"],["j","b","m","w","d","q","s","q","t","w","e","m"],["x","l","m","m","l","s","o","x","d","q","u","t"],["l","i","f","p","l","a","c","e","t","u","t","o"],["m","o","s","b","r","t","c","y","z","v","r","r"],["j","t","x","c","a","r","t","a","b","l","e","o"],["b","h","k","m","d","b","r","y","q","u","i","y"],["y","è","s","r","h","g","o","m","m","e","w","h"],["u","q","p","c","s","c","x","b","k","e","d","o"],["u","u","o","l","q","v","y","y","b","y","e","h"],["r","e","o","u","j","b","u","r","e","a","u","k"]],']', '['"#ff0000","#00fe00","#0000ff","#d2ea9a","#407f76","#211f95","#e1f233"']', '['"place","cartable","gomme","bureau","bibliothèque","feutre","cahier"']']
This is an example of one line from csv. Looks bad but that’s the way it should be
Answers:
Can you change your data to int, or is it something non-integer like "m22"?
# Strip non-numeric characters from test_id before inserting, and fall
# back to a synthetic counter when nothing numeric remains.
with open("C:/Users/.../total_type19.csv", "r", encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=";")
    header = next(reader)  # consume the header line so it is not inserted
    print(f"HEADER {header}")
    counter = 1  # or whatever number you want to start with
    for row in reader:
        print(row)
        test_id = row[0]
        # Keep only the digit characters (e.g. "m22" -> "22").
        test_id = ''.join([i for i in test_id if i.isdigit()])
        if test_id == '':
            # No digits at all: assign the next synthetic id.
            counter += 1
            test_id = counter
        else:
            test_id = int(test_id)
        print(test_id)
        cursor5.execute(
            'INSERT INTO interaction_fillword (test_id,data_size,data_matrix, data_words_selection, data_colors, data_answers) VALUES(%s,%s, %s, %s, %s, %s)',
            (test_id, row[1], row[2], row[3], row[4], row[5]))
# Without a commit the transaction is rolled back when the connection
# closes and none of the inserts persist.
conn5.commit()
Use `copy_expert` from `psycopg2`:
import psycopg2

conn5 = psycopg2.connect(dbname='', user='',
                         password='', host='', port='')
cursor5 = conn5.cursor()
with open("C:/Users/.../total_type19.csv", "r") as csv_file:
    # DELIMITER ';' is required: the file was written with sep=';' but
    # COPY ... WITH CSV defaults to a comma delimiter.
    cursor5.copy_expert(
        "COPY interaction_fillword FROM STDIN WITH CSV HEADER DELIMITER ';'",
        csv_file)
# COPY runs inside a transaction; commit to make the import permanent.
conn5.commit()
The CSV HEADER
will do a couple of things:
- Skip the header line automatically.
- Take empty non-quoted strings as NULL.
copy_expert
uses the Postgres COPY command to do bulk data import (or export) a lot quicker than inserting. The downside is that COPY
is all or nothing: either the entire import/export succeeds, or a single error will roll back the entire thing.
I have a big script; the result is that the data is stored in a dataframe and then in a csv file. Then the csv is opened and written to PostgreSQL. But there is a problem: the data type of one column is int4, and after opening the csv the column format is 'text'. I cannot change the data type in the database; the values must be stored exactly as int. Please tell me how to do this.
# Export the dataframe to CSV, then re-read it and insert row by row.
total.to_csv("C:/Users/.../total_type19.csv", index=False, sep=';')
conn5 = psycopg2.connect(dbname='', user='',
                         password='', host='', port='')
cursor5 = conn5.cursor()
with open("C:/Users/.../total_type19.csv", "r", encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=";")
    # Skip the header line — otherwise the column names themselves are
    # inserted as a data row, and 'test_id' (a header string) can never
    # be stored in an int4 column.
    next(reader)
    for row in reader:
        cursor5.execute(
            'INSERT INTO interaction_fillword (test_id,data_size,data_matrix, data_words_selection, data_colors, data_answers) VALUES(%s,%s, %s, %s, %s, %s)',
            # Cast test_id to int so psycopg2 sends it as an integer
            # and Postgres accepts it into the int4 column.
            (int(row[0]), row[1], row[2], row[3], row[4], row[5]))
conn5.commit()
The test_id column must be in int4 format
['312229', "['[{from:[3,4],to:[7,4],color:2},{from:[3,6],to:[10,6],color:3},{from:[5,8],to:[9,8],color:5},{from:[5,11],to:[10,11],color:6},{from:[1,0],to:[1,11],color:0},{from:[10,1],to:[10,6],color:4},{from:[3,0],to:[8,0],color:1}],']", '['["v","b","c","c","a","h","i","e","r","s","f","j"],["d","i","w","s","s","r","i","f","y","y","f","c"],["j","b","m","w","d","q","s","q","t","w","e","m"],["x","l","m","m","l","s","o","x","d","q","u","t"],["l","i","f","p","l","a","c","e","t","u","t","o"],["m","o","s","b","r","t","c","y","z","v","r","r"],["j","t","x","c","a","r","t","a","b","l","e","o"],["b","h","k","m","d","b","r","y","q","u","i","y"],["y","è","s","r","h","g","o","m","m","e","w","h"],["u","q","p","c","s","c","x","b","k","e","d","o"],["u","u","o","l","q","v","y","y","b","y","e","h"],["r","e","o","u","j","b","u","r","e","a","u","k"]],']', '['"#ff0000","#00fe00","#0000ff","#d2ea9a","#407f76","#211f95","#e1f233"']', '['"place","cartable","gomme","bureau","bibliothèque","feutre","cahier"']']
This is an example of one line from csv. Looks bad but that’s the way it should be
Can you change your data to int, or is it something non-integer like "m22"?
# Strip non-numeric characters from test_id before inserting, and fall
# back to a synthetic counter when nothing numeric remains.
with open("C:/Users/.../total_type19.csv", "r", encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=";")
    header = next(reader)  # consume the header line so it is not inserted
    print(f"HEADER {header}")
    counter = 1  # or whatever number you want to start with
    for row in reader:
        print(row)
        test_id = row[0]
        # Keep only the digit characters (e.g. "m22" -> "22").
        test_id = ''.join([i for i in test_id if i.isdigit()])
        if test_id == '':
            # No digits at all: assign the next synthetic id.
            counter += 1
            test_id = counter
        else:
            test_id = int(test_id)
        print(test_id)
        cursor5.execute(
            'INSERT INTO interaction_fillword (test_id,data_size,data_matrix, data_words_selection, data_colors, data_answers) VALUES(%s,%s, %s, %s, %s, %s)',
            (test_id, row[1], row[2], row[3], row[4], row[5]))
# Without a commit the transaction is rolled back when the connection
# closes and none of the inserts persist.
conn5.commit()
Use `copy_expert` from `psycopg2`:
import psycopg2

conn5 = psycopg2.connect(dbname='', user='',
                         password='', host='', port='')
cursor5 = conn5.cursor()
with open("C:/Users/.../total_type19.csv", "r") as csv_file:
    # DELIMITER ';' is required: the file was written with sep=';' but
    # COPY ... WITH CSV defaults to a comma delimiter.
    cursor5.copy_expert(
        "COPY interaction_fillword FROM STDIN WITH CSV HEADER DELIMITER ';'",
        csv_file)
# COPY runs inside a transaction; commit to make the import permanent.
conn5.commit()
The CSV HEADER
will do a couple of things:
- Skip the header line automatically.
- Take empty non-quoted strings as NULL.
copy_expert
uses the Postgres COPY command to do bulk data import (or export) a lot quicker than inserting. The downside is that COPY
is all or nothing: either the entire import/export succeeds, or a single error will roll back the entire thing.