Loading data with DataLoader
We consider the IMDB dataset for this example.
We first define our custom IMDBLoader component.
Then, we define its associated IMDBLoaderConfig configuration and perform registrations.
Lastly, we define the runnable script to run our IMDBLoader and check loaded data.
IMDBLoader
class IMDBLoader(RunnableComponent):
    """Runnable component that downloads, extracts and parses the IMDB sentiment dataset.

    The parsed dataset is cached as a .csv file so that subsequent loads skip the
    download/extraction/parsing pipeline entirely.
    """

    def __init__(
            self,
            download_directory: Path,
            download_filename: str,
            dataset_name: str,
            download_url: str,
            samples_amount: int = -1
    ):
        """
        Args:
            download_directory: folder where the dataset archive is downloaded.
            download_filename: name of the archive file.
            dataset_name: filename of the cached .csv dataframe.
            download_url: URL of the dataset archive file.
            samples_amount: maximum number of samples per split to keep;
                a negative value (default) keeps all samples.
        """
        self.download_directory = download_directory
        self.download_filename = download_filename
        self.dataset_name = dataset_name
        self.download_url = download_url
        self.samples_amount = samples_amount

        # Derived paths: archive file, extraction folder and cached .csv dataframe.
        self.download_path = download_directory.joinpath(download_filename)
        self.extraction_path = self.download_path.parents[0]
        self.dataframe_path = self.extraction_path.joinpath(dataset_name)

    def download(
            self
    ):
        """Downloads the dataset archive (if missing), extracts it and deletes the archive."""
        # exist_ok avoids the separate exists() check (no-op if already created)
        self.download_directory.mkdir(parents=True, exist_ok=True)

        # Download
        if not self.download_path.exists():
            download_url(url=self.download_url, download_path=self.download_path)

        # Extract
        # NOTE(review): extractall() on an archive fetched from the network is
        # vulnerable to path traversal for untrusted sources; the Stanford URL
        # used here is assumed trusted. Consider tarfile's extraction filters.
        with tarfile.open(self.download_path) as loaded_tar:
            loaded_tar.extractall(self.extraction_path)

        # Clean
        if self.download_path.exists():
            self.download_path.unlink()

    def read_df_from_files(
            self
    ) -> pd.DataFrame:
        """Builds the dataset dataframe by reading the extracted review text files.

        Each review filename encodes ``<file_id>_<score>.txt``; the sentiment and
        split come from the folder layout (``aclImdb/<split>/<sentiment>``).

        Returns:
            A dataframe with columns [file_id, score, y, split, x] where ``y`` is
            the sentiment label and ``x`` the review text. The dataframe is also
            persisted to ``self.dataframe_path`` for quick retrieval.
        """
        dataframe_rows = []
        for split in ['train', 'test']:
            for sentiment in ['pos', 'neg']:
                folder = self.extraction_path.joinpath('aclImdb', split, sentiment)
                for filepath in folder.glob('**/*'):
                    if not filepath.is_file():
                        continue

                    filename = filepath.name
                    with filepath.open(mode='r', encoding='utf-8') as text_file:
                        text = text_file.read()

                    # Filename pattern is "<file_id>_<score>.txt"
                    score = filename.split("_")[1].split(".")[0]
                    file_id = filename.split("_")[0]

                    # create single dataframe row
                    dataframe_row = {
                        "file_id": file_id,
                        "score": score,
                        "sentiment": sentiment,
                        "split": split,
                        "text": text
                    }
                    dataframe_rows.append(dataframe_row)

        df = pd.DataFrame(dataframe_rows)
        df = df[["file_id",
                 "score",
                 "sentiment",
                 "split",
                 "text"]]
        df = df.rename(columns={'sentiment': 'y', 'text': 'x'})

        # Save dataframe for quick retrieval (index=False: row index carries no info)
        df.to_csv(path_or_buf=self.dataframe_path, index=False)

        return df

    def load_data(
            self
    ) -> pd.DataFrame:
        """Returns the IMDB dataframe, building it from source files on first call.

        On the first call the archive is downloaded/extracted and parsed (the
        parse step persists the .csv cache); later calls read the cache directly.
        """
        if self.dataframe_path.is_file():
            print('Loaded pre-loaded dataset...')
            df = pd.read_csv(self.dataframe_path)
        else:
            print('First time loading dataset...Downloading...')
            self.download()
            # read_df_from_files() already persists the dataframe to dataframe_path,
            # so no extra to_csv call is needed here.
            df = self.read_df_from_files()
        return df

    def get_splits(
            self,
    ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], Optional[pd.DataFrame]]:
        """Returns shuffled (train, val, test) splits; val is always None for IMDB.

        Each split is capped at ``samples_amount`` rows; a negative value keeps
        all rows.
        """
        df = self.load_data()

        # Negative samples_amount means "no cap". Slicing with None keeps every row,
        # whereas slicing with -1 would silently drop the last sample.
        limit = self.samples_amount if self.samples_amount >= 0 else None

        train = df[df.split == 'train'].sample(frac=1).reset_index(drop=True)[:limit]
        val = None
        test = df[df.split == 'test'].sample(frac=1).reset_index(drop=True)[:limit]
        return train, val, test

    def run(
            self,
            config: Optional[cinnamon.configuration.Configuration] = None
    ):
        """Command-line entry point: loads and returns the dataset dataframe."""
        return self.load_data()
The IMDBLoader does the following:
download: checks whether the dataset has to be downloaded from the web. If so, the loader downloads it and extracts the archive file. read_df_from_files: an internal utility function that reads the extracted files to build a pandas.DataFrame view of the IMDB dataset. load_data: the API to invoke to obtain the pandas.DataFrame of the dataset. get_splits: retrieves the train, validation and test data splits, if available. run: runs load_data and returns the resulting pandas.DataFrame. The run method defines the entry point for running IMDBLoader via the command line.
IMDBLoaderConfig
The IMDBLoader uses IMDBLoaderConfig as default configuration template.
class IMDBLoaderConfig(Configuration):
    """Default configuration template for ``IMDBLoader``."""

    @classmethod
    @register_method(name='data_loader',
                     tags={'imdb'},
                     namespace='examples',
                     component_class=IMDBLoader)
    def default(
            cls
    ):
        """Builds the default configuration template and registers it under
        (name='data_loader', tags={'imdb'}, namespace='examples'),
        bound to the ``IMDBLoader`` component.
        """
        config = super().default()

        # One (name, value, type_hint, description) spec per configuration parameter.
        parameter_specs = (
            ('download_directory',
             Path(__file__).resolve().parent.parent.joinpath('datasets'),
             Path,
             'Folder the archive file is downloaded'),
            ('download_filename',
             'imdb.tar.gz',
             str,
             'Name of the archive file'),
            ('dataset_name',
             'dataset.csv',
             str,
             '.csv filename'),
            ('download_url',
             'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
             Union[AnyStr, Path],
             'URL to dataset archive file'),
            ('samples_amount',
             500,
             int,
             'Number of samples per split to consider at maximum'),
        )
        for param_name, param_value, param_type, param_description in parameter_specs:
            config.add(name=param_name,
                       value=param_value,
                       type_hint=param_type,
                       description=param_description)

        return config
Note that we register the default template via RegistrationKey (name=data_loader, tags={'imdb'}, namespace=examples) and bind it to IMDBLoader component.
Running IMDBLoader
We can now write a script to test IMDBLoader.
from pathlib import Path
from cinnamon.registry import Registry
from components.data_loader import IMDBLoader
if __name__ == '__main__':
    """
    Demo script: retrieve and build the registered IMDB data loader, then run it
    to load the IMDB dataset and print it for visualization purposes.
    """
    # Project root: the registry scans it for registered configurations.
    project_directory = Path(__file__).parent.parent.resolve()
    Registry.setup(directory=project_directory)

    imdb_loader = IMDBLoader.build_component(name='data_loader',
                                             tags={'imdb'},
                                             namespace='examples')

    imdb_df = imdb_loader.load_data()
    print(imdb_df)
Next!
That’s it! We have defined our data loader component to load and parse the IMDB dataset.
Next, we define a data Processor to further parse the input data for our classifier.