# flake8: noqa
from __future__ import annotations
from typing import Any
import numpy as np
import pandas as pd
import torch_frame
from torch_frame.config.text_embedder import TextEmbedderConfig
from torch_frame.config.text_tokenizer import TextTokenizerConfig
from torch_frame.typing import TaskType
from torch_frame.utils import generate_random_split
from torch_frame.utils.split import SPLIT_TO_NUM
SPLIT_COL = 'split'
class DataFrameTextBenchmark(torch_frame.data.Dataset):
r"""A collection of datasets for tabular learning with text columns,
covering categorical, numerical, multi-categorical and timestamp
features. The datasets are categorized according to their task types
and scales.
    Args:
        root (str): Root directory.
        task_type (TaskType): The task type. Either
            :obj:`TaskType.BINARY_CLASSIFICATION`,
            :obj:`TaskType.MULTICLASS_CLASSIFICATION`, or
            :obj:`TaskType.REGRESSION`.
        scale (str): The scale of the dataset. :obj:`"small"` means 5K to
            50K rows. :obj:`"medium"` means 50K to 500K rows. :obj:`"large"`
            means more than 500K rows.
        text_stype (torch_frame.stype): The text stype to use for text
            columns in the dataset.
            (default: :obj:`torch_frame.text_embedded`)
        idx (int): The index of the dataset within a category specified via
            :obj:`task_type` and :obj:`scale`.
        col_to_text_embedder_cfg (TextEmbedderConfig or dict, optional):
            A text embedder config, or a dictionary mapping text column
            names to their text embedder configs. (default: :obj:`None`)
        col_to_text_tokenizer_cfg (TextTokenizerConfig or dict, optional):
            A text tokenizer config, or a dictionary mapping text column
            names to their text tokenizer configs. (default: :obj:`None`)
        split_random_state (int): The random seed used to generate the
            train/val/test split. (default: :obj:`42`)
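
    For example, to load the first small binary classification dataset
    (``my_embedder`` below is a placeholder for a text embedding model that
    you provide yourself):

    .. code-block:: python

        from torch_frame.config.text_embedder import TextEmbedderConfig
        from torch_frame.datasets import DataFrameTextBenchmark
        from torch_frame.typing import TaskType

        dataset = DataFrameTextBenchmark(
            root='/tmp/data',
            task_type=TaskType.BINARY_CLASSIFICATION,
            scale='small',
            idx=0,
            col_to_text_embedder_cfg=TextEmbedderConfig(
                text_embedder=my_embedder,  # placeholder, user-provided
                batch_size=32,
            ),
        )
        dataset.materialize()
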
**STATS:**
.. list-table::
:widths: 20 10 10 10 10 10 10 10 20 20 10
:header-rows: 1
* - Task
- Scale
- Idx
- #rows
- #cols (numerical)
- #cols (categorical)
- #cols (text)
- #cols (other)
- #classes
- Class object
- Missing value ratio
* - binary_classification
- small
- 0
- 15,907
- 0
- 3
- 2
- 0
- 2
- MultimodalTextBenchmark(name='fake_job_postings2')
- 23.8%
* - binary_classification
- medium
- 0
- 125,000
- 29
- 0
- 1
- 0
- 2
- MultimodalTextBenchmark(name='jigsaw_unintended_bias100K')
- 41.4%
* - binary_classification
- medium
- 1
- 108,128
- 1
- 3
- 3
- 2
- 2
- MultimodalTextBenchmark(name='kick_starter_funding')
- 0.0%
* - multiclass_classification
- small
- 0
- 6,364
- 0
- 1
- 1
- 0
- 4
- MultimodalTextBenchmark(name='product_sentiment_machine_hack')
- 0.0%
* - multiclass_classification
- small
- 1
- 25,355
- 14
- 0
- 1
- 0
- 6
- MultimodalTextBenchmark(name='news_channel')
- 0.0%
* - multiclass_classification
- small
- 2
- 19,802
- 0
- 3
- 2
- 1
- 6
- MultimodalTextBenchmark(name='data_scientist_salary')
- 12.3%
* - multiclass_classification
- small
- 3
- 22,895
- 26
- 47
- 13
- 3
- 10
- MultimodalTextBenchmark(name='melbourne_airbnb')
- 9.6%
* - multiclass_classification
- medium
- 0
- 105,154
- 2
- 2
- 1
- 0
- 30
- MultimodalTextBenchmark(name='wine_reviews')
- 1.0%
* - multiclass_classification
- medium
- 1
- 114,000
- 11
- 5
- 3
- 0
- 114
- HuggingFaceDatasetDict(path='maharshipandya/spotify-tracks-dataset', target_col='track_genre')
- 0.0%
* - multiclass_classification
- large
- 0
- 568,454
- 2
- 3
- 2
- 0
- 5
- AmazonFineFoodReviews()
- 0.0%
* - regression
- small
- 0
- 6,079
- 0
- 1
- 3
- 0
- 1
- MultimodalTextBenchmark(name='google_qa_answer_type_reason_explanation')
- 0.0%
* - regression
- small
- 1
- 6,079
- 0
- 1
- 3
- 0
- 1
- MultimodalTextBenchmark(name='google_qa_question_type_reason_explanation')
- 0.0%
* - regression
- small
- 2
- 6,237
- 2
- 3
- 3
- 0
- 1
- MultimodalTextBenchmark(name='bookprice_prediction')
- 1.7%
* - regression
- small
- 3
- 13,575
- 2
- 1
- 2
- 0
- 1
- MultimodalTextBenchmark(name='jc_penney_products')
- 13.7%
* - regression
- small
- 4
- 23,486
- 1
- 3
- 2
- 0
- 1
- MultimodalTextBenchmark(name='women_clothing_review')
- 1.8%
* - regression
- small
- 5
- 30,009
- 3
- 0
- 1
- 0
- 1
- MultimodalTextBenchmark(name='news_popularity2')
- 0.0%
* - regression
- small
- 6
- 28,328
- 2
- 5
- 1
- 3
- 1
- MultimodalTextBenchmark(name='ae_price_prediction')
- 6.1%
* - regression
- small
- 7
- 47,439
- 18
- 8
- 2
- 11
- 1
- MultimodalTextBenchmark(name='california_house_price')
- 13.8%
* - regression
- medium
- 0
- 125,000
- 0
- 6
- 2
- 1
- 1
- MultimodalTextBenchmark(name='mercari_price_suggestion100K')
- 3.4%
* - regression
- large
- 0
- 1,482,535
- 1
- 4
- 2
- 1
- 1
- Mercari()
- 0.0%
"""
dataset_categorization_dict: dict[str, dict[str, list[tuple]]] = {
'binary_classification': {
'small': [
('MultimodalTextBenchmark', {
'name': 'fake_job_postings2'
}),
],
'medium': [
('MultimodalTextBenchmark', {
'name': 'jigsaw_unintended_bias100K'
}),
('MultimodalTextBenchmark', {
'name': 'kick_starter_funding'
}),
],
'large': [],
},
'multiclass_classification': {
'small': [
('MultimodalTextBenchmark', {
'name': 'product_sentiment_machine_hack'
}),
('MultimodalTextBenchmark', {
'name': 'news_channel'
}),
('MultimodalTextBenchmark', {
'name': 'data_scientist_salary'
}),
('MultimodalTextBenchmark', {
'name': 'melbourne_airbnb'
}),
],
'medium': [
('MultimodalTextBenchmark', {
'name': 'wine_reviews'
}),
('HuggingFaceDatasetDict', {
'path':
'maharshipandya/spotify-tracks-dataset',
'columns': [
'artists', 'album_name', 'track_name', 'popularity',
'duration_ms', 'explicit', 'danceability', 'energy',
'key', 'loudness', 'mode', 'speechiness',
'acousticness', 'instrumentalness', 'liveness',
'valence', 'tempo', 'time_signature', 'track_genre'
],
'target_col':
'track_genre',
}),
],
'large': [
('AmazonFineFoodReviews', {}),
],
},
'regression': {
'small': [
('MultimodalTextBenchmark', {
'name': 'google_qa_answer_type_reason_explanation'
}),
('MultimodalTextBenchmark', {
'name': 'google_qa_question_type_reason_explanation'
}),
('MultimodalTextBenchmark', {
'name': 'bookprice_prediction'
}),
('MultimodalTextBenchmark', {
'name': 'jc_penney_products'
}),
('MultimodalTextBenchmark', {
'name': 'women_clothing_review'
}),
('MultimodalTextBenchmark', {
'name': 'news_popularity2'
}),
('MultimodalTextBenchmark', {
'name': 'ae_price_prediction'
}),
('MultimodalTextBenchmark', {
'name': 'california_house_price'
}),
],
'medium': [
('MultimodalTextBenchmark', {
'name': 'mercari_price_suggestion100K'
}),
],
'large': [
('Mercari', {}),
],
}
}
    @classmethod
def datasets_available(
cls,
task_type: TaskType,
scale: str,
) -> list[tuple[str, dict[str, Any]]]:
r"""List of datasets available for a given :obj:`task_type` and
:obj:`scale`.
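
        Example:
            >>> DataFrameTextBenchmark.datasets_available(
            ...     TaskType.REGRESSION, scale='large')
            [('Mercari', {})]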
"""
return cls.dataset_categorization_dict[task_type.value][scale]
    @classmethod
    def num_datasets_available(cls, task_type: TaskType, scale: str) -> int:
r"""Number of datasets available for a given :obj:`task_type` and
:obj:`scale`.
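
        Example:
            >>> DataFrameTextBenchmark.num_datasets_available(
            ...     TaskType.REGRESSION, scale='small')
            8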
"""
return len(cls.datasets_available(task_type, scale))
def __init__(
self,
root: str,
task_type: TaskType,
scale: str,
idx: int,
text_stype: torch_frame.stype = torch_frame.text_embedded,
col_to_text_embedder_cfg: dict[str, TextEmbedderConfig]
| TextEmbedderConfig | None = None,
col_to_text_tokenizer_cfg: dict[str, TextTokenizerConfig]
| TextTokenizerConfig | None = None,
split_random_state: int = 42,
):
self.root = root
self._task_type = task_type
self.scale = scale
self.idx = idx
datasets = self.datasets_available(task_type, scale)
if idx >= len(datasets):
raise ValueError(
f"The idx needs to be smaller than {len(datasets)}, which is "
f"the number of available datasets for task_type: "
f"{task_type.value} and scale: {scale} (got idx: {idx}).")
        class_name, kwargs = datasets[idx]
        if class_name in {'MultimodalTextBenchmark', 'AmazonFineFoodReviews'}:
            text_args = dict(
                text_stype=text_stype,
                col_to_text_embedder_cfg=col_to_text_embedder_cfg,
                col_to_text_tokenizer_cfg=col_to_text_tokenizer_cfg)
        else:
            # TODO (zecheng): support text tokenized for
            # HuggingFaceDatasetDict
            text_args = dict(
                col_to_text_embedder_cfg=col_to_text_embedder_cfg)
        if class_name == 'HuggingFaceDatasetDict':
            # HuggingFace handles the data download, so no root directory
            # is needed:
            dataset = getattr(torch_frame.datasets, class_name)(**text_args,
                                                                **kwargs)
else:
dataset = getattr(torch_frame.datasets,
class_name)(root=root, **text_args, **kwargs)
self.cls_str = str(dataset)
# Add split col
df = dataset.df
        # Follow the default split shipped with MultimodalTextBenchmark
        # datasets:
if class_name == 'MultimodalTextBenchmark':
df = df.sort_values(by=[SPLIT_COL])
num_unique = df[SPLIT_COL].nunique()
assert num_unique > 1
            # Only two splits are provided; manually carve a validation set
            # out of the train split:
if num_unique == 2:
ser = df[SPLIT_COL]
train_ser = ser[ser == SPLIT_TO_NUM['train']]
split_ser = generate_random_split(length=len(train_ser),
seed=split_random_state,
train_ratio=0.9,
val_ratio=0.1,
include_test=False)
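                # Rows after the train block (sorted to the end above) are
                # the original test set; re-attach their test labels after
                # the new train/val labels: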
split_ser = np.concatenate([
split_ser,
np.full(len(df) - len(split_ser), SPLIT_TO_NUM['test'])
])
df[SPLIT_COL] = split_ser
else:
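            # No default split is provided; generate a fresh random
            # 80/10/10 train/val/test split: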
if SPLIT_COL in df.columns:
df.drop(columns=[SPLIT_COL], inplace=True)
split_df = pd.DataFrame({
SPLIT_COL:
generate_random_split(length=len(df), seed=split_random_state,
train_ratio=0.8, val_ratio=0.1)
})
df = pd.concat([df, split_df], axis=1)
        # For regression tasks, normalize the target to zero mean and unit
        # variance:
if task_type == TaskType.REGRESSION:
ser = df[dataset.target_col]
df[dataset.target_col] = (ser - ser.mean()) / ser.std()
        # Check that the number of rows matches the specified scale:
        if dataset.num_rows < 5000:
            assert False, "The dataset has fewer than 5K rows."
        elif dataset.num_rows < 50000:
            assert scale == "small"
        elif dataset.num_rows < 500000:
            assert scale == "medium"
        else:
            assert scale == "large"
super().__init__(
df,
dataset.col_to_stype,
target_col=dataset.target_col,
split_col=SPLIT_COL,
col_to_sep=dataset.col_to_sep,
col_to_text_embedder_cfg=dataset.col_to_text_embedder_cfg,
col_to_text_tokenizer_cfg=dataset.col_to_text_tokenizer_cfg,
)
del dataset
def __repr__(self) -> str:
return (f'{self.__class__.__name__}(\n'
f' task_type={self._task_type.value},\n'
f' scale={self.scale},\n'
f' idx={self.idx},\n'
f' cls={self.cls_str}\n'
f')')
    def materialize(self, *args, **kwargs) -> torch_frame.data.Dataset:
        r"""Materializes the dataset and verifies that the inferred task
        type matches the specified :obj:`task_type`.
        """
super().materialize(*args, **kwargs)
if self.task_type != self._task_type:
raise RuntimeError(f"task type does not match. It should be "
f"{self.task_type.value} but specified as "
f"{self._task_type.value}.")
return self