Convert the string 2.90K to 2900 or 5.2M to 5200000 in pandas dataframe
Question:
Need some help on processing data inside a pandas dataframe.
Any help is most welcome.
I have OHCLV data in CSV format. I have loaded the file in to pandas dataframe.
How do I convert the volume column from 2.90K to 2900 or 5.2M to 5200000.
The column can contain both K in form of thousands and M in millions.
import pandas as pd
file_path = '/home/fatjoe/UCHM.csv'
df = pd.read_csv(file_path, parse_dates=[0], index_col=0)
df.columns = [
"closing_price",
"opening_price",
"high_price",
"low_price",
"volume",
"change"]
df['opening_price'] = df['closing_price']
df['opening_price'] = df['opening_price'].shift(-1)
df = df.replace('-', 0)
df = df[:-1]
print(df.head())
Console:
Date
2016-09-23 0
2016-09-22 9.60K
2016-09-21 54.20K
2016-09-20 115.30K
2016-09-19 18.90K
2016-09-16 176.10K
2016-09-15 31.60K
2016-09-14 10.00K
2016-09-13 3.20K
Answers:
assuming you have the following DF:
In [30]: df
Out[30]:
Date Val
0 2016-09-23 100
1 2016-09-22 9.60M
2 2016-09-21 54.20K
3 2016-09-20 115.30K
4 2016-09-19 18.90K
5 2016-09-16 176.10K
6 2016-09-15 31.60K
7 2016-09-14 10.00K
8 2016-09-13 3.20M
you can do it this way:
In [31]: df.Val = (df.Val.replace(r'[KM]+$', '', regex=True).astype(float) *
....: df.Val.str.extract(r'[d.]+([KM]+)', expand=False)
....: .fillna(1)
....: .replace(['K','M'], [10**3, 10**6]).astype(int))
In [32]: df
Out[32]:
Date Val
0 2016-09-23 100.0
1 2016-09-22 9600000.0
2 2016-09-21 54200.0
3 2016-09-20 115300.0
4 2016-09-19 18900.0
5 2016-09-16 176100.0
6 2016-09-15 31600.0
7 2016-09-14 10000.0
8 2016-09-13 3200000.0
Explanation:
In [36]: df.Val.replace(r'[KM]+$', '', regex=True).astype(float)
Out[36]:
0 100.0
1 9.6
2 54.2
3 115.3
4 18.9
5 176.1
6 31.6
7 10.0
8 3.2
Name: Val, dtype: float64
In [37]: df.Val.str.extract(r'[d.]+([KM]+)', expand=False)
Out[37]:
0 NaN
1 M
2 K
3 K
4 K
5 K
6 K
7 K
8 M
Name: Val, dtype: object
In [38]: df.Val.str.extract(r'[d.]+([KM]+)', expand=False).fillna(1)
Out[38]:
0 1
1 M
2 K
3 K
4 K
5 K
6 K
7 K
8 M
Name: Val, dtype: object
In [39]: df.Val.str.extract(r'[d.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [10**3, 10**6]).astype(int)
Out[39]:
0 1
1 1000000
2 1000
3 1000
4 1000
5 1000
6 1000
7 1000
8 1000000
Name: Val, dtype: int32
def value_to_float(x):
if type(x) == float or type(x) == int:
return x
if 'K' in x:
if len(x) > 1:
return float(x.replace('K', '')) * 1000
return 1000.0
if 'M' in x:
if len(x) > 1:
return float(x.replace('M', '')) * 1000000
return 1000000.0
if 'B' in x:
return float(x.replace('B', '')) * 1000000000
return 0.0
df['col'] = df['col'].apply(value_to_float)
DataFrame.replace
with pd.eval
I like MaxU’s answer. You can considerably shorten this using pd.eval
:
df['Val'].replace({'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
0 100
1 9600000
2 54200
3 115300
4 18900
5 176100
6 31600
7 10000
8 3200000
Name: Val, dtype: int64
Slight modification will make this case insensitive as well:
repl_dict = {'[kK]': '*1e3', '[mM]': '*1e6', '[bB]': '*1e9', }
df['Val'].replace(repl_dict, regex=True).map(pd.eval)
0 100.0
1 9600000.0
2 54200.0
3 115300.0
4 18900.0
5 176100.0
6 31600.0
7 10000.0
8 3200000.0
Name: Val, dtype: float64
Explanation
Assuming "Val" is a column of strings, the replace
operation yields,
df['Val'].replace({'K': '*1e3', 'M': '*1e6'}, regex=True)
0 100
1 9.60*1e6
2 54.20*1e3
3 115.30*1e3
4 18.90*1e3
5 176.10*1e3
6 31.60*1e3
7 10.00*1e3
8 3.20*1e6
Name: Val, dtype: object
This is an arithmetic expression which pd.eval
can evaluate!
_ .map(pd.eval)
0 100.0
1 9600000.0
2 54200.0
3 115300.0
4 18900.0
5 176100.0
6 31600.0
7 10000.0
8 3200000.0
Name: Val, dtype: float64
To further generalize cs95’s answer I would do this:
df['Val'].replace({'K': '*1e3', 'M': '*1e6', '-':'-1'}, regex=True).map(pd.eval).astype(int)
since on some numeric values pd.eval has to multiply ‘-‘ by some other number which will result an error. (could not convert string to float ‘-‘)
You can use numerize
library, too easy!
pip install numerize
Use
print(numerize.numerize(1000))
print(numerize.numerize(100000))
print(numerize.numerize(1234567))
print(numerize.numerize(123456789))
Result:
1K
100K
1.23M
123.46M
def value_to_float(x):
try:
x = x.upper()
if 'CEN' in x:
return float(x.replace('CEN', '')) * 10**303
elif 'GO' in x:
return float(x.replace('GO', '')) * 10**100
elif 'QIT' in x:
return float(x.replace('QIT', '')) * 10**84
elif 'QAT' in x:
return float(x.replace('QAT', '')) * 10**45
elif 'TE' in x:
return float(x.replace('TE', '')) * 10**42
elif 'DU' in x:
return float(x.replace('DU', '')) * 10**39
elif 'UN' in x:
return float(x.replace('UN', '')) * 10**36
elif 'DE' in x:
return float(x.replace('DE', '')) * 10**33
elif 'NO' in x:
return float(x.replace('NO', '')) * 10**30
elif 'OC' in x:
return float(x.replace('OC', '')) * 10**27
elif 'SP' in x:
return float(x.replace('SP', '')) * 10**24
elif 'SX' in x:
return float(x.replace('SX', '')) * 10**21
elif 'QI' in x:
return float(x.replace('QI', '')) * 10**18
elif 'QA' in x:
return float(x.replace('QA', '')) * 10**15
elif 'T' in x:
return float(x.replace('T', '')) * 10**12
elif 'B' in x:
return float(x.replace('B', '')) * 10**9
elif 'M' in x:
return float(x.replace('M', '')) * 10**6
elif 'K' in x:
return float(x.replace('K', '')) * 10**3
else:
return float(x)
return 0.0
except Exception:
return 0.0
def float_to_value(x: float):
try:
if x > 10**303-1:
return str(x/10**303) + 'CEN'
elif x > 10**100-1:
return str(x/10**100) + 'GO'
elif x > 10**84-1:
return str(x/10**84) + 'QIT'
elif x > 10**45-1:
return str(x/10**45) + 'QAT'
elif x > 10**42-1:
return str(x/10**42) + 'TE'
elif x > 10**39-1:
return str(x/10**39) + 'DU'
elif x > 10**36-1:
return str(x/10**36) + 'UN'
elif x > 10**33-1:
return str(x/10**33) + 'DE'
elif x > 10**30-1:
return str(x/10**30) + 'NO'
elif x > 10**27-1:
return str(x/10**27) + 'OC'
elif x > 10**24-1:
return str(x/10**24) + 'SP'
elif x > 10**21-1:
return str(x/10**21) + 'SX'
elif x > 10**18-1:
return str(x/10**18) + 'QI'
elif x > 10**15-1:
return str(x/10**15) + 'QA'
elif x > 10**12-1:
return str(x/10**12) + 'T'
elif x > 10**9-1:
return str(x/10**9) + 'B'
elif x > 10**6-1:
return str(x/10**6) + 'M'
elif x > 10**3-1:
return str(x/10**3) + 'K'
else:
return str(x)
except Exception as err:
return '0.0'
df["Val"] = df["Val"].replace(['€','K','M','.'] ,'' , regex=True).astype(int) * df["Val"].replace(['€','.','K','M'] ,['','',1000,1000000] , regex=True).astype(int)
Need some help on processing data inside a pandas dataframe.
Any help is most welcome.
I have OHCLV data in CSV format. I have loaded the file in to pandas dataframe.
How do I convert the volume column from 2.90K to 2900 or 5.2M to 5200000.
The column can contain both K in form of thousands and M in millions.
import pandas as pd
file_path = '/home/fatjoe/UCHM.csv'
df = pd.read_csv(file_path, parse_dates=[0], index_col=0)
df.columns = [
"closing_price",
"opening_price",
"high_price",
"low_price",
"volume",
"change"]
df['opening_price'] = df['closing_price']
df['opening_price'] = df['opening_price'].shift(-1)
df = df.replace('-', 0)
df = df[:-1]
print(df.head())
Console:
Date
2016-09-23 0
2016-09-22 9.60K
2016-09-21 54.20K
2016-09-20 115.30K
2016-09-19 18.90K
2016-09-16 176.10K
2016-09-15 31.60K
2016-09-14 10.00K
2016-09-13 3.20K
assuming you have the following DF:
In [30]: df
Out[30]:
Date Val
0 2016-09-23 100
1 2016-09-22 9.60M
2 2016-09-21 54.20K
3 2016-09-20 115.30K
4 2016-09-19 18.90K
5 2016-09-16 176.10K
6 2016-09-15 31.60K
7 2016-09-14 10.00K
8 2016-09-13 3.20M
you can do it this way:
In [31]: df.Val = (df.Val.replace(r'[KM]+$', '', regex=True).astype(float) *
....: df.Val.str.extract(r'[d.]+([KM]+)', expand=False)
....: .fillna(1)
....: .replace(['K','M'], [10**3, 10**6]).astype(int))
In [32]: df
Out[32]:
Date Val
0 2016-09-23 100.0
1 2016-09-22 9600000.0
2 2016-09-21 54200.0
3 2016-09-20 115300.0
4 2016-09-19 18900.0
5 2016-09-16 176100.0
6 2016-09-15 31600.0
7 2016-09-14 10000.0
8 2016-09-13 3200000.0
Explanation:
In [36]: df.Val.replace(r'[KM]+$', '', regex=True).astype(float)
Out[36]:
0 100.0
1 9.6
2 54.2
3 115.3
4 18.9
5 176.1
6 31.6
7 10.0
8 3.2
Name: Val, dtype: float64
In [37]: df.Val.str.extract(r'[d.]+([KM]+)', expand=False)
Out[37]:
0 NaN
1 M
2 K
3 K
4 K
5 K
6 K
7 K
8 M
Name: Val, dtype: object
In [38]: df.Val.str.extract(r'[d.]+([KM]+)', expand=False).fillna(1)
Out[38]:
0 1
1 M
2 K
3 K
4 K
5 K
6 K
7 K
8 M
Name: Val, dtype: object
In [39]: df.Val.str.extract(r'[d.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [10**3, 10**6]).astype(int)
Out[39]:
0 1
1 1000000
2 1000
3 1000
4 1000
5 1000
6 1000
7 1000
8 1000000
Name: Val, dtype: int32
def value_to_float(x):
if type(x) == float or type(x) == int:
return x
if 'K' in x:
if len(x) > 1:
return float(x.replace('K', '')) * 1000
return 1000.0
if 'M' in x:
if len(x) > 1:
return float(x.replace('M', '')) * 1000000
return 1000000.0
if 'B' in x:
return float(x.replace('B', '')) * 1000000000
return 0.0
df['col'] = df['col'].apply(value_to_float)
DataFrame.replace
with pd.eval
I like MaxU’s answer. You can considerably shorten this using pd.eval
:
df['Val'].replace({'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
0 100
1 9600000
2 54200
3 115300
4 18900
5 176100
6 31600
7 10000
8 3200000
Name: Val, dtype: int64
Slight modification will make this case insensitive as well:
repl_dict = {'[kK]': '*1e3', '[mM]': '*1e6', '[bB]': '*1e9', }
df['Val'].replace(repl_dict, regex=True).map(pd.eval)
0 100.0
1 9600000.0
2 54200.0
3 115300.0
4 18900.0
5 176100.0
6 31600.0
7 10000.0
8 3200000.0
Name: Val, dtype: float64
Explanation
Assuming "Val" is a column of strings, the replace
operation yields,
df['Val'].replace({'K': '*1e3', 'M': '*1e6'}, regex=True)
0 100
1 9.60*1e6
2 54.20*1e3
3 115.30*1e3
4 18.90*1e3
5 176.10*1e3
6 31.60*1e3
7 10.00*1e3
8 3.20*1e6
Name: Val, dtype: object
This is an arithmetic expression which pd.eval
can evaluate!
_ .map(pd.eval)
0 100.0
1 9600000.0
2 54200.0
3 115300.0
4 18900.0
5 176100.0
6 31600.0
7 10000.0
8 3200000.0
Name: Val, dtype: float64
To further generalize cs95’s answer I would do this:
df['Val'].replace({'K': '*1e3', 'M': '*1e6', '-':'-1'}, regex=True).map(pd.eval).astype(int)
since on some numeric values pd.eval has to multiply ‘-‘ by some other number which will result an error. (could not convert string to float ‘-‘)
You can use numerize
library, too easy!
pip install numerize
Use
print(numerize.numerize(1000))
print(numerize.numerize(100000))
print(numerize.numerize(1234567))
print(numerize.numerize(123456789))
Result:
1K
100K
1.23M
123.46M
def value_to_float(x):
try:
x = x.upper()
if 'CEN' in x:
return float(x.replace('CEN', '')) * 10**303
elif 'GO' in x:
return float(x.replace('GO', '')) * 10**100
elif 'QIT' in x:
return float(x.replace('QIT', '')) * 10**84
elif 'QAT' in x:
return float(x.replace('QAT', '')) * 10**45
elif 'TE' in x:
return float(x.replace('TE', '')) * 10**42
elif 'DU' in x:
return float(x.replace('DU', '')) * 10**39
elif 'UN' in x:
return float(x.replace('UN', '')) * 10**36
elif 'DE' in x:
return float(x.replace('DE', '')) * 10**33
elif 'NO' in x:
return float(x.replace('NO', '')) * 10**30
elif 'OC' in x:
return float(x.replace('OC', '')) * 10**27
elif 'SP' in x:
return float(x.replace('SP', '')) * 10**24
elif 'SX' in x:
return float(x.replace('SX', '')) * 10**21
elif 'QI' in x:
return float(x.replace('QI', '')) * 10**18
elif 'QA' in x:
return float(x.replace('QA', '')) * 10**15
elif 'T' in x:
return float(x.replace('T', '')) * 10**12
elif 'B' in x:
return float(x.replace('B', '')) * 10**9
elif 'M' in x:
return float(x.replace('M', '')) * 10**6
elif 'K' in x:
return float(x.replace('K', '')) * 10**3
else:
return float(x)
return 0.0
except Exception:
return 0.0
def float_to_value(x: float):
try:
if x > 10**303-1:
return str(x/10**303) + 'CEN'
elif x > 10**100-1:
return str(x/10**100) + 'GO'
elif x > 10**84-1:
return str(x/10**84) + 'QIT'
elif x > 10**45-1:
return str(x/10**45) + 'QAT'
elif x > 10**42-1:
return str(x/10**42) + 'TE'
elif x > 10**39-1:
return str(x/10**39) + 'DU'
elif x > 10**36-1:
return str(x/10**36) + 'UN'
elif x > 10**33-1:
return str(x/10**33) + 'DE'
elif x > 10**30-1:
return str(x/10**30) + 'NO'
elif x > 10**27-1:
return str(x/10**27) + 'OC'
elif x > 10**24-1:
return str(x/10**24) + 'SP'
elif x > 10**21-1:
return str(x/10**21) + 'SX'
elif x > 10**18-1:
return str(x/10**18) + 'QI'
elif x > 10**15-1:
return str(x/10**15) + 'QA'
elif x > 10**12-1:
return str(x/10**12) + 'T'
elif x > 10**9-1:
return str(x/10**9) + 'B'
elif x > 10**6-1:
return str(x/10**6) + 'M'
elif x > 10**3-1:
return str(x/10**3) + 'K'
else:
return str(x)
except Exception as err:
return '0.0'
df["Val"] = df["Val"].replace(['€','K','M','.'] ,'' , regex=True).astype(int) * df["Val"].replace(['€','.','K','M'] ,['','',1000,1000000] , regex=True).astype(int)