how to create a dataset from three files on disk with datasets library in python?
Question:
I have three files on disk named train.xlsx, validation.xlsx and test.xlsx.
I need to build a single dataset from these three files with the datasets library.
Here is my code:
from google.colab import drive
from datasets import Dataset
import pandas as pd
# Make the Drive folder that holds the spreadsheets visible to the notebook.
drive.mount('/content/drive')

# Read each split into its own DataFrame.
train_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/train.xlsx')
validation_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/valid.xlsx')
test_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/test.xlsx')

# Report the shape of every split, one per line.
print(train_data.shape, validation_data.shape, test_data.shape, sep='\n')
Now I need to have a dataset with these keys from corresponding files:
dataset['train'], dataset['validation'] and dataset['test']
Could anyone help me?
Answers:
Try this
# Flatten every split into a plain list of rows (one inner list per
# spreadsheet row).
train_data = train_data.values.tolist()
validation_data = validation_data.values.tolist()
test_data = test_data.values.tolist()

# One column per split. The keys carry no trailing whitespace, so the
# columns can be selected as df['train_data'], df['validation_data'], ...
d = {
    'train_data': train_data,
    'validation_data': validation_data,
    'test_data': test_data,
}

# The splits normally differ in length, and pd.DataFrame(data=d) raises
# ValueError for lists of unequal length. Wrapping each list in a Series
# lets pandas align on the index and pad the shorter columns with NaN.
df = pd.DataFrame({name: pd.Series(rows) for name, rows in d.items()})
Note that .values.tolist() works as-is when each DataFrame has a single column;
otherwise, select the column explicitly, e.g. train_data['COLUMN'].values.tolist()
Try this out
import pandas as pd
import os
from google.colab import drive
# Mount Drive, then switch into the data folder so the workbooks can be
# addressed by bare file name.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/NLP-Datasets/Question2_Data/')

# File names of the three splits, relative to the folder entered above.
train_data, validation_data, test_data = 'train.xlsx', 'valid.xlsx', 'test.xlsx'
paths = [train_data, validation_data, test_data]

# Read every workbook once; the resulting dict is keyed by file name.
dfs = dict(zip(paths, map(pd.read_excel, paths)))
Update:
You can create a dataset from three files on disk using the datasets library in Python as follows:
from google.colab import drive
from datasets import Dataset, DatasetDict
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Load train, validation, and test data
train_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/train.xlsx')
validation_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/valid.xlsx')
test_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/test.xlsx')

# Convert data to dictionary format ({column name: list of values})
train_dict = train_data.to_dict(orient='list')
validation_dict = validation_data.to_dict(orient='list')
test_dict = test_data.to_dict(orient='list')

# Create one Dataset per split and group them in a DatasetDict.
# Dataset.from_dict expects a {column name: values} mapping, so handing it
# the three splits directly would treat 'train'/'validation'/'test' as
# columns rather than as splits. DatasetDict is the container that maps
# split names to Dataset objects.
dataset = DatasetDict({
    'train': Dataset.from_dict(train_dict),
    'validation': Dataset.from_dict(validation_dict),
    'test': Dataset.from_dict(test_dict),
})

# Print the shapes of the data
print(dataset['train'].shape)
print(dataset['validation'].shape)
print(dataset['test'].shape)
I have three files on disk named train.xlsx, validation.xlsx and test.xlsx.
I need to build a single dataset from these three files with the datasets library.
Here is my code:
from google.colab import drive
from datasets import Dataset
import pandas as pd
# Make the Drive folder that holds the spreadsheets visible to the notebook.
drive.mount('/content/drive')

# Read each split into its own DataFrame.
train_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/train.xlsx')
validation_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/valid.xlsx')
test_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/test.xlsx')

# Report the shape of every split, one per line.
print(train_data.shape, validation_data.shape, test_data.shape, sep='\n')
Now I need to have a dataset with these keys from corresponding files:
dataset['train'], dataset['validation'] and dataset['test']
Could anyone help me?
Try this
# Flatten every split into a plain list of rows (one inner list per
# spreadsheet row).
train_data = train_data.values.tolist()
validation_data = validation_data.values.tolist()
test_data = test_data.values.tolist()

# One column per split. The keys carry no trailing whitespace, so the
# columns can be selected as df['train_data'], df['validation_data'], ...
d = {
    'train_data': train_data,
    'validation_data': validation_data,
    'test_data': test_data,
}

# The splits normally differ in length, and pd.DataFrame(data=d) raises
# ValueError for lists of unequal length. Wrapping each list in a Series
# lets pandas align on the index and pad the shorter columns with NaN.
df = pd.DataFrame({name: pd.Series(rows) for name, rows in d.items()})
Note that .values.tolist() works as-is when each DataFrame has a single column;
otherwise, select the column explicitly, e.g. train_data['COLUMN'].values.tolist()
Try this out
import pandas as pd
import os
from google.colab import drive
# Mount Drive, then switch into the data folder so the workbooks can be
# addressed by bare file name.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/NLP-Datasets/Question2_Data/')

# File names of the three splits, relative to the folder entered above.
train_data, validation_data, test_data = 'train.xlsx', 'valid.xlsx', 'test.xlsx'
paths = [train_data, validation_data, test_data]

# Read every workbook once; the resulting dict is keyed by file name.
dfs = dict(zip(paths, map(pd.read_excel, paths)))
Update:
You can create a dataset from three files on disk using the datasets library in Python as follows:
from google.colab import drive
from datasets import Dataset, DatasetDict
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Load train, validation, and test data
train_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/train.xlsx')
validation_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/valid.xlsx')
test_data = pd.read_excel('/content/drive/My Drive/NLP-Datasets/Question2_Data/test.xlsx')

# Convert data to dictionary format ({column name: list of values})
train_dict = train_data.to_dict(orient='list')
validation_dict = validation_data.to_dict(orient='list')
test_dict = test_data.to_dict(orient='list')

# Create one Dataset per split and group them in a DatasetDict.
# Dataset.from_dict expects a {column name: values} mapping, so handing it
# the three splits directly would treat 'train'/'validation'/'test' as
# columns rather than as splits. DatasetDict is the container that maps
# split names to Dataset objects.
dataset = DatasetDict({
    'train': Dataset.from_dict(train_dict),
    'validation': Dataset.from_dict(validation_dict),
    'test': Dataset.from_dict(test_dict),
})

# Print the shapes of the data
print(dataset['train'].shape)
print(dataset['validation'].shape)
print(dataset['test'].shape)