How to merge many data frames to only one
Question:
I have directory that has many folders inside each folder has (images folder & labels text
) and I want to combine them into one dataframe file by concatenating each folder's name with the image names to make them unique names.
The structure of my directory is like below:
$ tree
.
├── sample
│ ├---- folder_1 ----|-- -- train.jsonl
| | |----- imgs
| | | └───├── 0.png
| | | └── 1.png
| | | └── 2.png
| | | └── 3.png
.. .. ... ...
| | | └── n.png
│ ├---- folder_2 ----|-- -- train.jsonl
| | |----- imgs
| | | └───├── 0.png
| | | └── 1.png
| | | └── 2.png
| | | └── 3.png
.. .. ... ...
| | | └── n.png
│ ├---- folder_3 ----|-- -- train.jsonl
| | |----- imgs
| | | └───├── 0.png
| | | └── 1.png
| | | └── 2.png
| | | └── 3.png
.. .. ... ...
| | | └── n.png
In each folder, the train.jsonl
file contains the image name and the corresponding text for example in folder_1
{"file_name": "0.png", "text": "Hello"}
{"file_name": "1.png", "text": "there"}
In others as well folder_2
:
{"file_name": "0.png", "text": "Hi"}
{"file_name": "1.png", "text": "there from the second dir"}
….
What I want is to update the file_name
path by reading those JSON lines with pandas or Python and concatenating the parent directories with the image name:
Now after the update by @RAI
import pandas as pd
import os
df = pd.DataFrame(columns=['file_name', 'text'])
# Traverse the directory recursively
for root, dirs, files in os.walk('sample'):
for file in files:
if file == 'train.jsonl':
df_temp = pd.read_json(os.path.join(root, file), lines=True)
df_temp['file_name'] = os.path.join(root, 'imgs', df_temp['file_name'])
df = df.append(df_temp, ignore_index=True)
print(df)
Get this issue :
Traceback (most recent call last):
File "merage_files.py", line 11, in <module>
print(os.path.join(root, 'imgs', df_temp['file_name']))
File "/usr/lib/python3.8/posixpath.py", line 90, in join
genericpath._check_arg_types('join', a, *p)
File "/usr/lib/python3.8/genericpath.py", line 152, in _check_arg_types
raise TypeError(f'{funcname}() argument must be str, bytes, or '
TypeError: join() argument must be str, bytes, or os.PathLike object, not 'Series'
So the expected df should look like this:
file_name text
0 sample/folder_1/0.png Hello
1 sample/folder_1/1.png there
2 sample/folder_2/0.png Hi
3 sample/folder_2/1.png there from the second dir
.......... ........
To make them unique and we can loop through one data frame file combine all of them
Answers:
import os
import pandas as pd
df = pd.DataFrame(columns=['file_name', 'text'])
for dirpath, dirnames, filenames in os.walk('sample'):
if 'train.jsonl' in filenames:
with open(os.path.join(dirpath, 'train.jsonl'), 'r') as f:
for line in f:
json_data = eval(line)
img_path = os.path.join(dirpath, 'imgs', json_data['file_name'])
df = df.append({'file_name': img_path, 'text': json_data['text']}, ignore_index=True)
print(df)
I have directory that has many folders inside each folder has (images folder & labels text
) and I want to combine them into one dataframe file by concatenating each folder's name with the image names to make them unique names.
The structure of my directory is like below:
$ tree
.
├── sample
│ ├---- folder_1 ----|-- -- train.jsonl
| | |----- imgs
| | | └───├── 0.png
| | | └── 1.png
| | | └── 2.png
| | | └── 3.png
.. .. ... ...
| | | └── n.png
│ ├---- folder_2 ----|-- -- train.jsonl
| | |----- imgs
| | | └───├── 0.png
| | | └── 1.png
| | | └── 2.png
| | | └── 3.png
.. .. ... ...
| | | └── n.png
│ ├---- folder_3 ----|-- -- train.jsonl
| | |----- imgs
| | | └───├── 0.png
| | | └── 1.png
| | | └── 2.png
| | | └── 3.png
.. .. ... ...
| | | └── n.png
In each folder, the train.jsonl
file contains the image name and the corresponding text for example in folder_1
{"file_name": "0.png", "text": "Hello"}
{"file_name": "1.png", "text": "there"}
In others as well folder_2
:
{"file_name": "0.png", "text": "Hi"}
{"file_name": "1.png", "text": "there from the second dir"}
….
What I want is to update the file_name
path by reading those JSON lines with pandas or Python and concatenating the parent directories with the image name:
Now after the update by @RAI
import pandas as pd
import os
df = pd.DataFrame(columns=['file_name', 'text'])
# Traverse the directory recursively
for root, dirs, files in os.walk('sample'):
for file in files:
if file == 'train.jsonl':
df_temp = pd.read_json(os.path.join(root, file), lines=True)
df_temp['file_name'] = os.path.join(root, 'imgs', df_temp['file_name'])
df = df.append(df_temp, ignore_index=True)
print(df)
Get this issue :
Traceback (most recent call last):
File "merage_files.py", line 11, in <module>
print(os.path.join(root, 'imgs', df_temp['file_name']))
File "/usr/lib/python3.8/posixpath.py", line 90, in join
genericpath._check_arg_types('join', a, *p)
File "/usr/lib/python3.8/genericpath.py", line 152, in _check_arg_types
raise TypeError(f'{funcname}() argument must be str, bytes, or '
TypeError: join() argument must be str, bytes, or os.PathLike object, not 'Series'
So the expected df should look like this:
file_name text
0 sample/folder_1/0.png Hello
1 sample/folder_1/1.png there
2 sample/folder_2/0.png Hi
3 sample/folder_2/1.png there from the second dir
.......... ........
To make them unique and we can loop through one data frame file combine all of them
import os
import pandas as pd
df = pd.DataFrame(columns=['file_name', 'text'])
for dirpath, dirnames, filenames in os.walk('sample'):
if 'train.jsonl' in filenames:
with open(os.path.join(dirpath, 'train.jsonl'), 'r') as f:
for line in f:
json_data = eval(line)
img_path = os.path.join(dirpath, 'imgs', json_data['file_name'])
df = df.append({'file_name': img_path, 'text': json_data['text']}, ignore_index=True)
print(df)