mamkit.data package#

Submodules#

mamkit.data.collators module#

class mamkit.data.collators.AudioCollator#

Bases: object

class mamkit.data.collators.AudioTransformerCollator(model_card, sampling_rate, downsampling_factor=None, aggregate=False, processor_args=None, model_args=None)#

Bases: object

class mamkit.data.collators.MultimodalCollator(text_collator, audio_collator, label_collator)#

Bases: object

class mamkit.data.collators.PairAudioCollator#

Bases: AudioCollator

class mamkit.data.collators.PairMultimodalCollator(text_collator, audio_collator, label_collator)#

Bases: MultimodalCollator

class mamkit.data.collators.PairTextCollator(tokenizer, vocab)#

Bases: TextCollator

class mamkit.data.collators.PairTextTransformerCollator(model_card, tokenizer_args=None)#

Bases: TextTransformerCollator

class mamkit.data.collators.PairTextTransformerOutputCollator#

Bases: TextTransformerOutputCollator

class mamkit.data.collators.PairUnimodalCollator(features_collator, label_collator)#

Bases: UnimodalCollator

class mamkit.data.collators.TextCollator(tokenizer, vocab)#

Bases: object

class mamkit.data.collators.TextTransformerCollator(model_card, tokenizer_args=None)#

Bases: object

class mamkit.data.collators.TextTransformerOutputCollator#

Bases: object

class mamkit.data.collators.UnimodalCollator(features_collator, label_collator)#

Bases: object

mamkit.data.datasets module#

class mamkit.data.datasets.InputMode(value)#

Bases: Enum

Enum class for the input modes of the dataset.

TEXT_ONLY: only text data. AUDIO_ONLY: only audio data. TEXT_AUDIO: both text and audio data.

AUDIO_ONLY = 'audio-only'#
TEXT_AUDIO = 'text-audio'#
TEXT_ONLY = 'text-only'#
class mamkit.data.datasets.Loader(task_name, input_mode, base_data_path=None)#

Bases: ABC

add_splits(method, key)#
build_info_from_splits(train_df, val_df, test_df)#
Return type:

SplitInfo

abstract property data: DataFrame#
Return type:

DataFrame

abstract get_default_splits(as_iterator=False)#
Return type:

Union[List[SplitInfo], SplitInfo]

get_splits(key='default')#
Return type:

List[SplitInfo]

class mamkit.data.datasets.MArg(confidence, **kwargs)#

Bases: Loader

build()#
build_chunks()#
property data: DataFrame#
Return type:

DataFrame

get_default_splits(as_iterator=False)#
Return type:

Union[List[SplitInfo], SplitInfo]

get_mancini_2022_splits()#
Return type:

List[SplitInfo]

class mamkit.data.datasets.MMUSED(**kwargs)#

Bases: Loader

build_audio()#
build_from_scratch()#
copy_final_csv()#
Returns:

None. The function copies the generated dataset into the official ‘MM-USElecDeb60to16’ folder, renaming the file to ‘MM-USElecDeb60to16.csv’

copy_transcripts(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Returns:

None. The function copies transcripts from the original dataset folder to the ‘files/transcripts’ folder.

create_plain_text(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Return type:

None

Returns:

None. The function creates the plain version of each transcript, saving a new version ‘_plain.txt’ in the subdirectory of the corresponding debate. In the plain version, speaker information is removed and the text is tokenized by sentences. The plain text thus contains one sentence per line.

property data: DataFrame#
Return type:

DataFrame

generate_chunks(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Return type:

None

Returns:

None. The function generates the 20-minute chunks for each debate and saves them in the ‘split’ sub-folders of each debate in ‘files/debates_audio_recordings’

generate_clips(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Returns:

None. The function generates, for each debate, the audio clips corresponding to each sentence in the dataset. The audio files are saved in ‘files/audio_clips’ in subfolders corresponding to each debate. For each debate it creates a new dataset in which the clip-ID column is filled with the ID of the corresponding generated clip.

generate_dataset(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Returns:

None. The function generates a new dataset ‘.csv’ for each debate from the original dataset. Each new dataset contains 3 new columns corresponding to the new start and end timestamps calculated through the alignment with ‘aeneas’ and the debate_ids of the clip corresponding to each sentence. The function also saves a ‘duplicates.txt’ file for each debate, containing the duplicated sentences and the number of occurrences.

generate_empty_transcript_files(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Return type:

None

Returns:

None. The function generates as many empty ‘.txt’ files as there are chunks generated for each debate and saves them in the ‘splits’ subdirectory of each debate in the ‘files/transcripts’ folder

get_default_splits(as_iterator=False)#
Return type:

Union[List[SplitInfo], SplitInfo]

load()#
remove_duplicates(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Returns:

None. The function removes duplicates in the dataset

remove_not_found(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Returns:

None. The function removes samples marked ‘NOT_FOUND’, i.e. sentences for which a match with the alignment results was not found.

run_aeneas(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Return type:

None

Returns:

None. For each debate it executes the script to perform the alignment of audio and text. The ‘.json’ files resulting from the alignment are saved in ‘files/alignment_results’, with a subfolder for each debate.

trim_audio(debate_ids, start_min, start_sec, end_min, end_sec)#
Parameters:
  • debate_ids (List) – list of strings representing debates IDs

  • start_min (List) – list of strings representing the number of minutes to be cut from the beginning of the file

  • start_sec (List) – list of strings representing the number of seconds to be cut from the beginning of the file

  • end_min (List) – list of strings representing the number of minutes to be cut from the end of the file

  • end_sec (List) – list of strings representing the number of seconds to be cut from the end of the file

Return None:

None. The function removes from the original audio file the portions of audio corresponding to the specified seconds and minutes and saves a new version of the file ‘_trim.wav’ in ‘files/debates_audio_recordings’ (in the corresponding debate’s sub folder).

Return type:

None

unify_datasets_debates(debate_ids)#
Parameters:

debate_ids (List) – list of strings representing debates IDs

Returns:

None. The function combines the datasets created for each debate to create the new dataset MM-ElecDeb60to16

class mamkit.data.datasets.MMUSEDFallacy(sample_rate=16000, clip_modality='full', n_files=None, **kwargs)#

Bases: Loader

build_clips()#
property data: DataFrame#
Return type:

DataFrame

generate_clips(element, ids, dataset_path)#
get_default_splits(as_iterator=False)#
Return type:

Union[List[SplitInfo], SplitInfo]

get_mancini_2024_splits()#
Return type:

List[SplitInfo]

load()#
class mamkit.data.datasets.MultimodalDataset(texts, audio, labels)#

Bases: Dataset

class mamkit.data.datasets.PairMultimodalDataset(a_texts, b_texts, a_audio, b_audio, labels)#

Bases: Dataset

class mamkit.data.datasets.PairUnimodalDataset(a_inputs, b_inputs, labels)#

Bases: Dataset

class mamkit.data.datasets.SplitInfo(train, val, test)#

Bases: object

test: UnimodalDataset | PairUnimodalDataset | MultimodalDataset | PairMultimodalDataset | None#
train: UnimodalDataset | PairUnimodalDataset | MultimodalDataset | PairMultimodalDataset#
val: UnimodalDataset | PairUnimodalDataset | MultimodalDataset | PairMultimodalDataset | None#
class mamkit.data.datasets.UKDebates(**kwargs)#

Bases: Loader

property data: DataFrame#
Return type:

DataFrame

get_default_splits(as_iterator=False)#
Return type:

Union[List[SplitInfo], SplitInfo]

get_mancini_2022_splits()#
Return type:

List[SplitInfo]

load()#
parse_all_annotations()#
parse_speaker_annotations(speaker)#
class mamkit.data.datasets.UnimodalDataset(inputs, labels)#

Bases: Dataset

Dataset class for unimodal data.

__getitem__(idx)#

Get item method.

Parameters:

idx – index of the item to retrieve

Returns:

input, label

Return type:

tuple

mamkit.data.processing module#

class mamkit.data.processing.AudioTransformer(model_card, sampling_rate, downsampling_factor=None, aggregate=False, processor_args=None, model_args=None)#

Bases: ProcessorComponent

clear()#
class mamkit.data.processing.AudioTransformerExtractor(model_card, sampling_rate, downsampling_factor=None, aggregate=False, processor_args=None, model_args=None)#

Bases: ProcessorComponent

clear()#
class mamkit.data.processing.MFCCExtractor(mfccs, sampling_rate=16000, pooling_sizes=None, remove_energy=True, normalize=True, serialization_path=None)#

Bases: ProcessorComponent

parse_audio(audio_file)#
class mamkit.data.processing.MultimodalProcessor(text_processor=None, audio_processor=None, label_processor=None)#

Bases: Processor

clear()#
fit(train_data)#
class mamkit.data.processing.PairAudioTransformer(model_card, sampling_rate, downsampling_factor=None, aggregate=False, processor_args=None, model_args=None)#

Bases: ProcessorComponent

clear()#
class mamkit.data.processing.PairAudioTransformerExtractor(model_card, sampling_rate, downsampling_factor=None, aggregate=False, processor_args=None, model_args=None)#

Bases: ProcessorComponent

clear()#
class mamkit.data.processing.PairMFCCExtractor(mfccs, sampling_rate=16000, pooling_sizes=None, remove_energy=True, normalize=True, serialization_path=None)#

Bases: ProcessorComponent

parse_audio(audio_file)#
class mamkit.data.processing.PairMultimodalProcessor(text_processor=None, audio_processor=None, label_processor=None)#

Bases: Processor

clear()#
fit(train_data)#
class mamkit.data.processing.PairTextTransformer(model_card, tokenizer_args=None, model_args=None)#

Bases: ProcessorComponent

clear()#
class mamkit.data.processing.PairUnimodalProcessor(features_processor=None, label_processor=None)#

Bases: Processor

clear()#
fit(train_data)#
class mamkit.data.processing.PairVocabBuilder(tokenizer, embedding_dim, embedding_model=None, tokenization_args=None)#

Bases: ProcessorComponent

clear()#
fit(a_texts, b_texts)#
reset()#
class mamkit.data.processing.Processor#

Bases: object

clear()#
fit(train_data)#
reset()#
class mamkit.data.processing.ProcessorComponent#

Bases: object

clear()#
fit(*args, **kwargs)#
reset()#
class mamkit.data.processing.TextTransformer(model_card, tokenizer_args=None, model_args=None)#

Bases: ProcessorComponent

clear()#
class mamkit.data.processing.UnimodalProcessor(features_processor=None, label_processor=None)#

Bases: Processor

clear()#
fit(train_data)#
class mamkit.data.processing.VocabBuilder(tokenizer, embedding_dim, embedding_model=None, tokenization_args=None)#

Bases: ProcessorComponent

clear()#
fit(texts)#
reset()#

Module contents#