Loading data with DataLoader
We consider the IMDB dataset for this example. We first define our custom IMDBLoader component. Then, we define its associated IMDBLoaderConfig configuration and perform the registrations. Lastly, we write a runnable script that executes the IMDBLoader and inspects the loaded data.
IMDBLoader
import tarfile
from pathlib import Path
from typing import Optional, Tuple

import pandas as pd

import cinnamon.configuration
from cinnamon.component import RunnableComponent

# Note: download_url is a small project helper that fetches a file from the
# web to the given path; its definition is omitted here.


class IMDBLoader(RunnableComponent):

    def __init__(
            self,
            download_directory: Path,
            download_filename: str,
            dataset_name: str,
            download_url: str,
            samples_amount: int = -1
    ):
        self.download_directory = download_directory
        self.download_filename = download_filename
        self.dataset_name = dataset_name
        self.download_url = download_url
        self.samples_amount = samples_amount

        self.download_path = download_directory.joinpath(download_filename)
        self.extraction_path = self.download_path.parents[0]
        self.dataframe_path = self.extraction_path.joinpath(dataset_name)

    def download(
            self
    ):
        if not self.download_directory.exists():
            self.download_directory.mkdir(parents=True)

        # Download the archive if it is not already on disk
        if not self.download_path.exists():
            download_url(url=self.download_url, download_path=self.download_path)

        # Extract the archive next to the downloaded file
        with tarfile.open(self.download_path) as loaded_tar:
            loaded_tar.extractall(self.extraction_path)

        # Clean up the archive once extracted
        if self.download_path.exists():
            self.download_path.unlink()

    def read_df_from_files(
            self
    ) -> pd.DataFrame:
        dataframe_rows = []
        for split in ['train', 'test']:
            for sentiment in ['pos', 'neg']:
                folder = self.extraction_path.joinpath('aclImdb', split, sentiment)
                for filepath in folder.glob('**/*'):
                    if not filepath.is_file():
                        continue

                    filename = filepath.name
                    with filepath.open(mode='r', encoding='utf-8') as text_file:
                        text = text_file.read()

                    # Review files are named <file_id>_<score>.txt
                    score = filename.split("_")[1].split(".")[0]
                    file_id = filename.split("_")[0]

                    # Build a single dataframe row
                    dataframe_row = {
                        "file_id": file_id,
                        "score": score,
                        "sentiment": sentiment,
                        "split": split,
                        "text": text
                    }
                    dataframe_rows.append(dataframe_row)

        df = pd.DataFrame(dataframe_rows)
        df = df[["file_id",
                 "score",
                 "sentiment",
                 "split",
                 "text"]]
        df = df.rename(columns={'sentiment': 'y', 'text': 'x'})

        # Save the dataframe for quick retrieval
        df.to_csv(path_or_buf=self.dataframe_path, index=False)

        return df

    def load_data(
            self
    ) -> pd.DataFrame:
        if self.dataframe_path.is_file():
            print('Loading pre-built dataset...')
            df = pd.read_csv(self.dataframe_path)
        else:
            print('First time loading dataset... Downloading...')
            self.download()
            df = self.read_df_from_files()
        return df

    def get_splits(
            self,
    ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], Optional[pd.DataFrame]]:
        df = self.load_data()

        # A non-positive samples_amount means 'keep all samples'
        limit = self.samples_amount if self.samples_amount > 0 else None
        train = df[df.split == 'train'].sample(frac=1).reset_index(drop=True)[:limit]
        val = None
        test = df[df.split == 'test'].sample(frac=1).reset_index(drop=True)[:limit]
        return train, val, test

    def run(
            self,
            config: Optional[cinnamon.configuration.Configuration] = None
    ):
        return self.load_data()
The IMDBLoader does the following:

- download: checks whether the dataset has to be downloaded from the web. If so, the loader downloads it and extracts the archive file.
- read_df_from_files: an internal utility that reads the extracted files to build a pandas.DataFrame view of the IMDB dataset (see the filename-parsing sketch after this list).
- load_data: the API to invoke to obtain the pandas.DataFrame of the dataset.
- get_splits: retrieves the train, validation and test data splits, if available.
- run: runs load_data and returns the resulting pandas.DataFrame. The run method defines the entry point for running IMDBLoader via the command line.
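The parsing in read_df_from_files relies on the naming convention of the aclImdb review files, where each review is stored as <file_id>_<score>.txt. A minimal standalone sketch of that split (the filename '123_8.txt' is just an illustrative example):

# Sketch: how read_df_from_files decodes an aclImdb review filename.
# Review files follow the <file_id>_<score>.txt convention, e.g. '123_8.txt'.
filename = '123_8.txt'

file_id = filename.split("_")[0]               # '123'
score = filename.split("_")[1].split(".")[0]   # '8'

assert (file_id, score) == ('123', '8')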
IMDBLoaderConfig
The IMDBLoader uses IMDBLoaderConfig as its default configuration template.
from pathlib import Path
from typing import AnyStr, Union

from cinnamon.configuration import Configuration
from cinnamon.registry import register_method

from components.data_loader import IMDBLoader


class IMDBLoaderConfig(Configuration):

    @classmethod
    @register_method(name='data_loader',
                     tags={'imdb'},
                     namespace='examples',
                     component_class=IMDBLoader)
    def default(
            cls
    ):
        config = super().default()

        config.add(name='download_directory',
                   value=Path(__file__).resolve().parent.parent.joinpath('datasets'),
                   type_hint=Path,
                   description='Directory where the archive file is downloaded')
        config.add(name='download_filename',
                   value='imdb.tar.gz',
                   type_hint=str,
                   description='Name of the archive file')
        config.add(name='dataset_name',
                   value='dataset.csv',
                   type_hint=str,
                   description='Filename of the .csv dataset')
        config.add(name='download_url',
                   value='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
                   type_hint=Union[AnyStr, Path],
                   description='URL of the dataset archive file')
        config.add(name='samples_amount',
                   value=500,
                   type_hint=int,
                   description='Maximum number of samples to keep per split')

        return config
Note that we register the default template via a RegistrationKey (name='data_loader', tags={'imdb'}, namespace='examples') and bind it to the IMDBLoader component.
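Conceptually, this binding is what lets build_component instantiate IMDBLoader with the registered configuration values. Written out by hand using the defaults registered above, the build is roughly equivalent to the following sketch (illustrative only; in practice the download directory is resolved relative to the project, not the literal 'datasets' path shown here):

from pathlib import Path

# Roughly what build_component does under the hood, spelled out by hand
# with the default values registered in IMDBLoaderConfig.
loader = IMDBLoader(
    download_directory=Path('datasets'),
    download_filename='imdb.tar.gz',
    dataset_name='dataset.csv',
    download_url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
    samples_amount=500
)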
Running IMDBLoader
We can now write a script to test IMDBLoader.
from pathlib import Path

from cinnamon.registry import Registry

from components.data_loader import IMDBLoader

if __name__ == '__main__':
    """
    In this demo script, we retrieve and build our IMDB data loader.
    Once built, we run the data loader to load the IMDB dataset and print it for visualization purposes.
    """

    directory = Path(__file__).parent.parent.resolve()
    Registry.setup(directory=directory)

    loader = IMDBLoader.build_component(name='data_loader',
                                        tags={'imdb'},
                                        namespace='examples')
    df = loader.load_data()
    print(df)
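If everything is wired correctly, the script prints the loaded DataFrame. Given the column selection and rename performed in read_df_from_files, we expect the columns below; a quick sanity check we can append to the script:

# Sanity check: the rename in read_df_from_files maps
# 'sentiment' -> 'y' and 'text' -> 'x'.
print(df.columns.tolist())  # ['file_id', 'score', 'y', 'split', 'x']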
Next!
That’s it! We have defined our data loader component to load and parse the IMDB dataset.
Next, we define a data Processor to further parse the input data for our classifier.