! [ -e /content ] && pip install -Uqq fastai # 在Colab上升级fastai视觉数据
from __future__ import annotations
from fastai.torch_basics import *
from fastai.data.all import *
from fastai.vision.core import *
import types
from nbdev.showdoc import *
# 来自 fastai 视觉增强库的导入:*用于在视觉应用程序中获取
DataLoaders的辅助函数以及更高级的ImageDataLoaders类
本模块中定义的主要类是 ImageDataLoaders 和 SegmentationDataLoaders,所以你可能想直接查看它们的定义。它们提供的工厂方法是快速准备训练数据的好方法,更多示例请参见 视觉教程。
辅助函数
@delegates(subplots)
def get_grid(
    n:int, # Number of axes in the returned grid
    nrows:int=None, # Number of rows in the returned grid, defaulting to `int(math.sqrt(n))`
    ncols:int=None, # Number of columns in the returned grid, defaulting to `ceil(n/rows)`
    figsize:tuple=None, # Size (in inches) of the returned figure
    double:bool=False, # Whether to double the number of columns and `n`
    title:str=None, # If passed, used as the title of the figure
    return_fig:bool=False, # Whether to return the figure created by `subplots`
    flatten:bool=True, # Whether to flatten the matplot axes so they can be iterated over with a single loop
    **kwargs,
) -> (plt.Figure, plt.Axes): # Returns just `axs` by default, and (`fig`, `axs`) if `return_fig` is set to True
    "Return a grid of `n` axes, `rows` by `cols`"
    if not (nrows or ncols):
        # Neither dimension given: start from a roughly square layout.
        nrows = int(math.sqrt(n))
        ncols = int(np.ceil(n/nrows))
    elif nrows:
        ncols = ncols or int(np.ceil(n/nrows))
    else:
        nrows = int(np.ceil(n/ncols))
    if double:
        ncols *= 2
        n *= 2
    fig, axs = subplots(nrows, ncols, figsize=figsize, **kwargs)
    if flatten:
        kept = []
        for idx, ax in enumerate(axs.flatten()):
            # Surplus axes are switched off; `set_axis_off()` yields None, and those
            # entries sit at index >= n so the slice below discards them.
            kept.append(ax if idx < n else ax.set_axis_off())
        axs = kept[:n]
    if title is not None:
        fig.suptitle(title, weight='bold', size=14)
    if return_fig:
        return (fig, axs)
    return axs

# This is used by the type-dispatched versions of `show_batch` and `show_results` for the vision
# application. The default `figsize` is `(cols*imsize, rows*imsize+0.6)`. `imsize` is passed down to
# `subplots`. `suptitle`, `sharex`, `sharey`, `squeeze`, `subplot_kw` and `gridspec_kw` are all passed
# down to `plt.subplots`. If `return_fig` is `True`, returns `fig, axs`, otherwise just `axs`.
def clip_remove_empty(
    bbox:TensorBBox, # Coordinates of bounding boxes
    label:TensorMultiCategory # Labels of the bounding boxes
):
    "Clip bounding boxes with image border and remove empty boxes along with corresponding labels"
    clipped = torch.clamp(bbox, -1, 1)
    widths = clipped[...,2] - clipped[...,0]
    heights = clipped[...,3] - clipped[...,1]
    # A box is kept unless its signed area (width*height) is <= 0 after clipping.
    keep = ~(widths*heights <= 0.)
    return (clipped[keep], label[TensorBase(keep)])

# This is used in `bb_pad`.
# Check `clip_remove_empty`: the first box is clamped to [-1, 1]; the third box (x2 < x1) and the
# fifth box (zero width once clamped to -1) have area <= 0 and are dropped with their labels.
bb = TensorBBox([[-2,-0.5,0.5,1.5], [-0.5,-0.5,0.5,0.5], [1,0.5,0.5,0.75], [-0.5,-0.5,0.5,0.5], [-2, -0.5, -1.5, 0.5]])
bb,lbl = clip_remove_empty(bb, TensorMultiCategory([1,2,3,2,5]))
test_eq(bb, TensorBBox([[-1,-0.5,0.5,1.], [-0.5,-0.5,0.5,0.5], [-0.5,-0.5,0.5,0.5]]))
test_eq(lbl, TensorMultiCategory([1,2,2]))

def bb_pad(
    samples:list, # List of 3-tuples like (image, bounding_boxes, labels)
    pad_idx=0 # Label that will be used to pad each list of labels
):
    "Function that collects `samples` of labelled bboxes and adds padding with `pad_idx`."
    cleaned = []
    for sample in samples:
        boxes, labels = clip_remove_empty(*sample[1:])
        cleaned.append((sample[0], boxes, labels))
    # Every sample is padded up to the largest number of labels left after clipping.
    max_len = max(len(labels) for _, _, labels in cleaned)

    def _pad(img, boxes, labels):
        box_fill = boxes.new_zeros(max_len-boxes.shape[0], 4)
        lbl_fill = labels.new_zeros(max_len-labels.shape[0])+pad_idx
        return img, torch.cat([boxes, box_fill]), torch.cat([labels, lbl_fill])

    return [_pad(*sample) for sample in cleaned]

# This is used in `BBoxBlock`.
# Check `bb_pad` on two samples: the first has four boxes, one of which (negative width) is removed
# by clipping; the second has two boxes and must be padded to the common length of three.
img1,img2 = TensorImage(torch.randn(16,16,3)),TensorImage(torch.randn(16,16,3))
bb1 = tensor([[-2,-0.5,0.5,1.5], [-0.5,-0.5,0.5,0.5], [1,0.5,0.5,0.75], [-0.5,-0.5,0.5,0.5]])
lbl1 = tensor([1, 2, 3, 2])
bb2 = tensor([[-0.5,-0.5,0.5,0.5], [-0.5,-0.5,0.5,0.5]])
lbl2 = tensor([2, 2])
samples = [(img1, bb1, lbl1), (img2, bb2, lbl2)]
res = bb_pad(samples)
# NOTE(review): `non_empty` is not referenced by any assertion below.
non_empty = tensor([True,True,False,True])
# First sample: images pass through untouched, boxes are clipped, the empty box and its label are gone.
test_eq(res[0][0], img1)
test_eq(res[0][1], tensor([[-1,-0.5,0.5,1.], [-0.5,-0.5,0.5,0.5], [-0.5,-0.5,0.5,0.5]]))
test_eq(res[0][2], tensor([1,2,2]))
# Second sample: padded with one all-zero box and one label equal to the default `pad_idx` (0).
test_eq(res[1][0], img2)
test_eq(res[1][1], tensor([[-0.5,-0.5,0.5,0.5], [-0.5,-0.5,0.5,0.5], [0,0,0,0]]))
test_eq(res[1][2], tensor([2,2,0]))

# Show methods -
@typedispatch
def show_batch(x:TensorImage, y, samples, ctxs=None, max_n=10, nrows=None, ncols=None, figsize=None, **kwargs):
    # Image input with any target: build a grid if none was supplied, then defer to the
    # generic `object` implementation of `show_batch`.
    if ctxs is None:
        n_shown = min(len(samples), max_n)
        ctxs = get_grid(n_shown, nrows=nrows, ncols=ncols, figsize=figsize)
    return show_batch[object](x, y, samples, ctxs=ctxs, max_n=max_n, **kwargs)

@typedispatch
def show_batch(x:TensorImage, y:TensorImage, samples, ctxs=None, max_n=10, nrows=None, ncols=None, figsize=None, **kwargs):
    # Image input with image target: the grid is doubled so that element 0 of each sample lands on
    # the even contexts and element 1 on the odd ones.
    if ctxs is None:
        n_shown = min(len(samples), max_n)
        ctxs = get_grid(n_shown, nrows=nrows, ncols=ncols, figsize=figsize, double=True)
    for col in range(2):
        shown = []
        for item, ctx, _ in zip(samples.itemgot(col), ctxs[col::2], range(max_n)):
            shown.append(item.show(ctx=ctx, **kwargs))
        ctxs[col::2] = shown
    return ctxs

# `TransformBlock`s for vision
这些是视觉应用为数据块API提供的模块。
def ImageBlock(cls:PILBase=PILImage):
    "A `TransformBlock` for images of `cls`"
    opener = cls.create
    return TransformBlock(type_tfms=opener, batch_tfms=IntToFloatTensor)

def MaskBlock(
    codes:list=None # Vocab labels for segmentation masks
):
    "A `TransformBlock` for segmentation masks, potentially with `codes`"
    attach_codes = AddMaskCodes(codes=codes)
    return TransformBlock(type_tfms=PILMask.create, item_tfms=attach_codes, batch_tfms=IntToFloatTensor)

PointBlock = TransformBlock(type_tfms=TensorPoint.create, item_tfms=PointScaler)
# `BBoxBlock` registers `bb_pad` under the `before_batch` key of its `dls_kwargs`.
BBoxBlock = TransformBlock(type_tfms=TensorBBox.create, item_tfms=PointScaler, dls_kwargs = {'before_batch': bb_pad})
# Both blocks are instances rather than functions, so their docstrings are attached by hand.
PointBlock.__doc__ = "A `TransformBlock` for points in an image"
BBoxBlock.__doc__ = "A `TransformBlock` for bounding boxes in an image"

show_doc(PointBlock, name='PointBlock')
# Rendered output:
#   PointBlock
#   A TransformBlock for points in an image
show_doc(BBoxBlock, name='BBoxBlock')
# Rendered output:
#   BBoxBlock
#   A TransformBlock for bounding boxes in an image
def BBoxLblBlock(
    vocab:list=None, # Vocab labels for bounding boxes
    add_na:bool=True # Add NaN as a background class
):
    "A `TransformBlock` for labeled bounding boxes, potentially with `vocab`"
    categorize = MultiCategorize(vocab=vocab, add_na=add_na)
    return TransformBlock(type_tfms=categorize, item_tfms=BBoxLabeler)

# If `add_na` is `True`, a new category is added for NaN (that will represent the background class).
图像数据加载器 -
class ImageDataLoaders(DataLoaders):
    "Basic wrapper around several `DataLoader`s with factory methods for computer vision problems"
    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_folder(cls, path, train='train', valid='valid', valid_pct=None, seed=None, vocab=None, item_tfms=None,
                    batch_tfms=None, img_cls=PILImage, **kwargs):
        "Create from imagenet style dataset in `path` with `train` and `valid` subfolders (or provide `valid_pct`)"
        # Split by grandparent folder name unless a random validation percentage is requested.
        splitter = GrandparentSplitter(train_name=train, valid_name=valid) if valid_pct is None else RandomSplitter(valid_pct, seed=seed)
        # NOTE(review): `splitter` tests `valid_pct is None` while `get_items` tests truthiness, so
        # `valid_pct=0` gets a `RandomSplitter` over only the `train`/`valid` folders - confirm intended.
        get_items = get_image_files if valid_pct else partial(get_image_files, folders=[train, valid])
        dblock = DataBlock(blocks=(ImageBlock(img_cls), CategoryBlock(vocab=vocab)),
                           get_items=get_items,
                           splitter=splitter,
                           get_y=parent_label,
                           item_tfms=item_tfms,
                           batch_tfms=batch_tfms)
        # `path` is both the data source handed to the block and the `path` stored on the result.
        return cls.from_dblock(dblock, path, path=path, **kwargs)

    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_path_func(cls, path, fnames, label_func, valid_pct=0.2, seed=None, item_tfms=None, batch_tfms=None,
                       img_cls=PILImage, **kwargs):
        "Create from list of `fnames` in `path`s with `label_func`"
        dblock = DataBlock(blocks=(ImageBlock(img_cls), CategoryBlock),
                           splitter=RandomSplitter(valid_pct, seed=seed),
                           get_y=label_func,
                           item_tfms=item_tfms,
                           batch_tfms=batch_tfms)
        return cls.from_dblock(dblock, fnames, path=path, **kwargs)

    @classmethod
    def from_name_func(cls,
        path:str|Path, # Set the default path to a directory that a `Learner` can use to save files like models
        fnames:list, # A list of `os.Pathlike`'s to individual image files
        label_func:callable, # A function that receives a string (the file name) and outputs a label
        **kwargs
    ) -> DataLoaders:
        "Create from the name attrs of `fnames` in `path`s with `label_func`"
        # Lambdas are rejected up front on Windows; see the linked article on pickling and multiprocessing.
        if sys.platform == 'win32' and isinstance(label_func, types.LambdaType) and label_func.__name__ == '<lambda>':
            # https://medium.com/@jwnx/multiprocessing-serialization-in-python-with-pickle-9844f6fa1812
            raise ValueError("label_func couldn't be lambda function on Windows")
        # Apply `label_func` to each item's `name` attribute instead of the item itself.
        f = using_attr(label_func, 'name')
        return cls.from_path_func(path, fnames, f, **kwargs)

    @classmethod
    def from_path_re(cls, path, fnames, pat, **kwargs):
        "Create from list of `fnames` in `path`s with re expression `pat`"
        return cls.from_path_func(path, fnames, RegexLabeller(pat), **kwargs)

    # Deliberately NOT decorated with `@delegates(DataLoaders.from_dblock)`: `delegates` leaves alone
    # any function it has already processed, so decorating here turned the module-level
    # `delegates(to=ImageDataLoaders.from_name_func)` applied after the class into a no-op and hid
    # `valid_pct`, `seed`, `item_tfms`, `batch_tfms` and `img_cls` from the signature. Leaving
    # `**kwargs` in place (as `from_path_re` does) lets that later call expose the full parameter list.
    @classmethod
    def from_name_re(cls, path, fnames, pat, **kwargs):
        "Create from the name attrs of `fnames` in `path`s with re expression `pat`"
        return cls.from_name_func(path, fnames, RegexLabeller(pat), **kwargs)

    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_df(cls, df, path='.', valid_pct=0.2, seed=None, fn_col=0, folder=None, suff='', label_col=1, label_delim=None,
                y_block=None, valid_col=None, item_tfms=None, batch_tfms=None, img_cls=PILImage, **kwargs):
        "Create from `df` using `fn_col` and `label_col`"
        # Prefix prepended to every file name: `path` (optionally joined with `folder`) plus a separator.
        pref = f'{Path(path) if folder is None else Path(path)/folder}{os.path.sep}'
        if y_block is None:
            # Several label columns or a label delimiter both mean a multi-label target.
            is_multi = (is_listy(label_col) and len(label_col) > 1) or label_delim is not None
            y_block = MultiCategoryBlock if is_multi else CategoryBlock
        splitter = RandomSplitter(valid_pct, seed=seed) if valid_col is None else ColSplitter(valid_col)
        dblock = DataBlock(blocks=(ImageBlock(img_cls), y_block),
                           get_x=ColReader(fn_col, pref=pref, suff=suff),
                           get_y=ColReader(label_col, label_delim=label_delim),
                           splitter=splitter,
                           item_tfms=item_tfms,
                           batch_tfms=batch_tfms)
        return cls.from_dblock(dblock, df, path=path, **kwargs)

    @classmethod
    def from_csv(cls, path, csv_fname='labels.csv', header='infer', delimiter=None, quoting=csv.QUOTE_MINIMAL, **kwargs):
        "Create from `path/csv_fname` using `fn_col` and `label_col`"
        df = pd.read_csv(Path(path)/csv_fname, header=header, delimiter=delimiter, quoting=quoting)
        return cls.from_df(df, path=path, **kwargs)

    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_lists(cls, path, fnames, labels, valid_pct=0.2, seed:int=None, y_block=None, item_tfms=None, batch_tfms=None,
                   img_cls=PILImage, **kwargs):
        "Create from list of `fnames` and `labels` in `path`"
        if y_block is None:
            # The target type is guessed from the first label only: list-like with more than one
            # element -> multi-category, float -> regression, anything else -> single category.
            y_block = MultiCategoryBlock if is_listy(labels[0]) and len(labels[0]) > 1 else (
                RegressionBlock if isinstance(labels[0], float) else CategoryBlock)
        dblock = DataBlock.from_columns(blocks=(ImageBlock(img_cls), y_block),
                                        splitter=RandomSplitter(valid_pct, seed=seed),
                                        item_tfms=item_tfms,
                                        batch_tfms=batch_tfms)
        return cls.from_dblock(dblock, (fnames, labels), path=path, **kwargs)
# Applied after the class body because each target is itself an `ImageDataLoaders` method:
# `delegates` is called on every thin wrapper so that its signature can list the keyword
# parameters of the factory method it forwards `**kwargs` to.
ImageDataLoaders.from_csv = delegates(to=ImageDataLoaders.from_df)(ImageDataLoaders.from_csv)
ImageDataLoaders.from_name_func = delegates(to=ImageDataLoaders.from_path_func)(ImageDataLoaders.from_name_func)
ImageDataLoaders.from_path_re = delegates(to=ImageDataLoaders.from_path_func)(ImageDataLoaders.from_path_re)
ImageDataLoaders.from_name_re = delegates(to=ImageDataLoaders.from_name_func)(ImageDataLoaders.from_name_re)

# This class should not be used directly; one of the factory methods should be preferred instead.
# All those factory methods accept as arguments:
# - `item_tfms`: one or several transforms applied to the items before batching them
# - `batch_tfms`: one or several transforms applied to the batches once they are formed
# - `bs`: the batch size
# - `val_bs`: the batch size for the validation `DataLoader` (defaults to `bs`)
# - `shuffle_train`: whether to shuffle the training `DataLoader`
#   (NOTE(review): the rendered signatures in this file name this parameter `shuffle` - confirm)
# - `device`: the PyTorch device to use (defaults to `default_device()`)
show_doc(ImageDataLoaders.from_folder)ImageDataLoaders.from_folder
ImageDataLoaders.from_folder (path, train='train', valid='valid', valid_pct=None, seed=None, vocab=None, item_tfms=None, batch_tfms=None, img_cls=<class 'fastai.vision.core.PILImage'>, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
Create from imagenet style dataset in path with train and valid subfolders (or provide valid_pct)
| Type | Default | Details | |
|---|---|---|---|
| path | str | Path | . | Path to put in DataLoaders |
| train | str | train | |
| valid | str | valid | |
| valid_pct | NoneType | None | |
| seed | NoneType | None | |
| vocab | NoneType | None | |
| item_tfms | NoneType | None | |
| batch_tfms | NoneType | None | |
| img_cls | BypassNewMeta | PILImage | |
| bs | int | 64 | Size of batch |
| val_bs | int | None | Size of batch for validation DataLoader |
| shuffle | bool | True | Whether to shuffle data |
| device | NoneType | None | Device to put DataLoaders |
如果提供了 valid_pct,则会通过设置一定比例的数据用于验证集进行随机拆分(可以选择设置一个 seed)。如果传递了 vocab,则仅保留名称在 vocab 中的文件夹。
以下是加载 MNIST 子样本的示例:
path = untar_data(URLs.MNIST_TINY)
dls = ImageDataLoaders.from_folder(path, img_cls=PILImageBW)x,y = dls.one_batch()
test_eq(x.shape, [64, 1, 28, 28])传递 valid_pct 将忽略有效/训练文件夹并进行新的随机划分:
dls = ImageDataLoaders.from_folder(path, valid_pct=0.2)
dls.valid_ds.items[:3][Path('/home/jhoward/.fastai/data/mnist_tiny/train/7/9307.png'),
Path('/home/jhoward/.fastai/data/mnist_tiny/train/3/8241.png'),
Path('/home/jhoward/.fastai/data/mnist_tiny/valid/3/8924.png')]
show_doc(ImageDataLoaders.from_path_func)ImageDataLoaders.from_path_func
ImageDataLoaders.from_path_func (path, fnames, label_func, valid_pct=0.2, seed=None, item_tfms=None, batch_tfms=None, img_cls=<class 'fastai.vision.core.PILImage'>, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
Create from list of fnames in paths with label_func
| Type | Default | Details | |
|---|---|---|---|
| path | str | Path | . | Path to put in DataLoaders |
| fnames | |||
| label_func | |||
| valid_pct | float | 0.2 | |
| seed | NoneType | None | |
| item_tfms | NoneType | None | |
| batch_tfms | NoneType | None | |
| img_cls | BypassNewMeta | PILImage | |
| bs | int | 64 | Size of batch |
| val_bs | int | None | Size of batch for validation DataLoader |
| shuffle | bool | True | Whether to shuffle data |
| device | NoneType | None | Device to put DataLoaders |
验证集是一个随机子集,其比例由 valid_pct 决定,可以选择使用 seed 来确保结果的可重复性。
以下是如何在 MNIST 数据集上创建与之前示例相同的 DataLoaders,并使用 label_func:
fnames = get_image_files(path)
def label_func(x): return x.parent.name
dls = ImageDataLoaders.from_path_func(path, fnames, label_func)这是另一个关于宠物数据集的例子。这里的文件名都在一个“images”文件夹中,名称的格式为class_name_123.jpg。正确标记它们的一种方法是将最后一个_之后的所有内容丢弃:
show_doc(ImageDataLoaders.from_path_re)ImageDataLoaders.from_path_re
ImageDataLoaders.from_path_re (path, fnames, pat, valid_pct=0.2, seed=None, item_tfms=None, batch_tfms=None, img_cls=<class 'fastai.vision.core.PILImage'>, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
Create from list of fnames in paths with re expression pat
| Type | Default | Details | |
|---|---|---|---|
| path | str | Path | . | Path to put in DataLoaders |
| fnames | |||
| pat | |||
| valid_pct | float | 0.2 | |
| seed | NoneType | None | |
| item_tfms | NoneType | None | |
| batch_tfms | NoneType | None | |
| img_cls | BypassNewMeta | PILImage | |
| bs | int | 64 | Size of batch |
| val_bs | int | None | Size of batch for validation DataLoader |
| shuffle | bool | True | Whether to shuffle data |
| device | NoneType | None | Device to put DataLoaders |
验证集是比例为 valid_pct 的随机子集,可选地使用 seed 来实现可重复性。
以下是如何在 MNIST 数据集上创建与之前示例相同的 DataLoaders(在 Windows 上,您需要将前两个 `/` 更改为 `\`):
pat = r'/([^/]*)/\d+.png$'
dls = ImageDataLoaders.from_path_re(path, fnames, pat)show_doc(ImageDataLoaders.from_name_func)ImageDataLoaders.from_name_func
ImageDataLoaders.from_name_func (path:str|Path, fnames:list, label_func:callable, valid_pct=0.2, seed=None, item_tfms=None, batch_tfms=None, img_cls=<class 'fastai.vision.core.PILImage'>, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
Create from the name attrs of fnames in paths with label_func
| Type | Default | Details | |
|---|---|---|---|
| path | str | Path | Set the default path to a directory that a Learner can use to save files like models |
|
| fnames | list | A list of os.Pathlike’s to individual image files |
|
| label_func | callable | A function that receives a string (the file name) and outputs a label | |
| valid_pct | float | 0.2 | |
| seed | NoneType | None | |
| item_tfms | NoneType | None | |
| batch_tfms | NoneType | None | |
| img_cls | BypassNewMeta | PILImage | |
| bs | int | 64 | Size of batch |
| val_bs | int | None | Size of batch for validation DataLoader |
| shuffle | bool | True | Whether to shuffle data |
| device | NoneType | None | Device to put DataLoaders |
| Returns | DataLoaders |
验证集是一个随机子集,大小为 valid_pct,可以选择使用 seed 来确保可重复性。此方法的功能与 ImageDataLoaders.from_path_func 相同,只是 label_func 应用于每个文件名的名称,而不是完整路径。
show_doc(ImageDataLoaders.from_name_re)ImageDataLoaders.from_name_re
ImageDataLoaders.from_name_re (path, fnames, pat, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
Create from the name attrs of fnames in paths with re expression pat
| Type | Default | Details | |
|---|---|---|---|
| path | str | Path | . | Path to put in DataLoaders |
| fnames | |||
| pat | |||
| bs | int | 64 | Size of batch |
| val_bs | int | None | Size of batch for validation DataLoader |
| shuffle | bool | True | Whether to shuffle data |
| device | NoneType | None | Device to put DataLoaders |
验证集是比例为 valid_pct 的随机子集,可以选择使用 seed 来保证可重复性。该方法的功能与 ImageDataLoaders.from_path_re 相同,不同之处在于 pat 应用于每个文件名的名称,而不是完整路径。
show_doc(ImageDataLoaders.from_df)ImageDataLoaders.from_df
ImageDataLoaders.from_df (df, path='.', valid_pct=0.2, seed=None, fn_col=0, folder=None, suff='', label_col=1, label_delim=None, y_block=None, valid_col=None, item_tfms=None, batch_tfms=None, img_cls=<class 'fastai.vision.core.PILImage'>, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
Create from df using fn_col and label_col
| Type | Default | Details | |
|---|---|---|---|
| df | |||
| path | str | Path | . | Path to put in DataLoaders |
| valid_pct | float | 0.2 | |
| seed | NoneType | None | |
| fn_col | int | 0 | |
| folder | NoneType | None | |
| suff | str | ||
| label_col | int | 1 | |
| label_delim | NoneType | None | |
| y_block | NoneType | None | |
| valid_col | NoneType | None | |
| item_tfms | NoneType | None | |
| batch_tfms | NoneType | None | |
| img_cls | BypassNewMeta | PILImage | |
| bs | int | 64 | Size of batch |
| val_bs | int | None | Size of batch for validation DataLoader |
| shuffle | bool | True | Whether to shuffle data |
| device | NoneType | None | Device to put DataLoaders |
验证集是比例为 valid_pct 的随机子集,可选择使用 seed 创建以实现可重复性。或者,如果您的 df 中包含 valid_col,请将其名称或索引提供给该参数(该列中应将进入验证集的元素标记为 True)。
如果df中的文件名不应直接连接到path,您可以向文件名添加额外的folder。如果它们不包含正确的扩展名,您可以添加suff。如果您的标签列在每一行中包含多个标签,您可以使用label_delim来提醒库您有一个多标签问题。
当库自动选择的任务错误时,应传入y_block,然后您应提供CategoryBlock、MultiCategoryBlock或RegressionBlock。对于更高级的用法,您应使用数据块API。
之前的微型mnist示例也包含了一个数据框版本:
path = untar_data(URLs.MNIST_TINY)
df = pd.read_csv(path/'labels.csv')
df.head()| name | label | |
|---|---|---|
| 0 | train/3/7463.png | 3 |
| 1 | train/3/9829.png | 3 |
| 2 | train/3/7881.png | 3 |
| 3 | train/3/8065.png | 3 |
| 4 | train/3/7046.png | 3 |
以下是如何使用 ImageDataLoaders.from_df 加载它:
dls = ImageDataLoaders.from_df(df, path)/home/jhoward/git/fastai/fastai/data/transforms.py:212: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
o = r[c] if isinstance(c, int) or not c in getattr(r, '_fields', []) else getattr(r, c)
这是另一个多标签问题的示例:
path = untar_data(URLs.PASCAL_2007)
df = pd.read_csv(path/'train.csv')
df.head()| fname | labels | is_valid | |
|---|---|---|---|
| 0 | 000005.jpg | chair | True |
| 1 | 000007.jpg | car | True |
| 2 | 000009.jpg | horse person | True |
| 3 | 000012.jpg | car | False |
| 4 | 000016.jpg | bicycle | True |
dls = ImageDataLoaders.from_df(df, path, folder='train', valid_col='is_valid')/home/jhoward/git/fastai/fastai/data/transforms.py:212: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
o = r[c] if isinstance(c, int) or not c in getattr(r, '_fields', []) else getattr(r, c)
注意,您也可以将 2 传递给 valid_col(索引,起始为 0)。
show_doc(ImageDataLoaders.from_csv)ImageDataLoaders.from_csv
ImageDataLoaders.from_csv (path, csv_fname='labels.csv', header='infer', delimiter=None, valid_pct=0.2, seed=None, fn_col=0, folder=None, suff='', label_col=1, label_delim=None, y_block=None, valid_col=None, item_tfms=None, batch_tfms=None, img_cls=<class 'fastai.vision.core.PILImage'>, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
Create from path/csv_fname using fn_col and label_col
| Type | Default | Details | |
|---|---|---|---|
| path | str | Path | . | Path to put in DataLoaders |
| csv_fname | str | labels.csv | |
| header | str | infer | |
| delimiter | NoneType | None | |
| valid_pct | float | 0.2 | |
| seed | NoneType | None | |
| fn_col | int | 0 | |
| folder | NoneType | None | |
| suff | str | ||
| label_col | int | 1 | |
| label_delim | NoneType | None | |
| y_block | NoneType | None | |
| valid_col | NoneType | None | |
| item_tfms | NoneType | None | |
| batch_tfms | NoneType | None | |
| img_cls | BypassNewMeta | PILImage | |
| bs | int | 64 | Size of batch |
| val_bs | int | None | Size of batch for validation DataLoader |
| shuffle | bool | True | Whether to shuffle data |
| device | NoneType | None | Device to put DataLoaders |
与 ImageDataLoaders.from_df 相同,在使用 header 和 delimiter 加载文件后。
以下是使用此方法加载与之前相同的数据集的方式:
dls = ImageDataLoaders.from_csv(path, 'train.csv', folder='train', valid_col='is_valid')/home/jhoward/git/fastai/fastai/data/transforms.py:212: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
o = r[c] if isinstance(c, int) or not c in getattr(r, '_fields', []) else getattr(r, c)
show_doc(ImageDataLoaders.from_lists)ImageDataLoaders.from_lists
ImageDataLoaders.from_lists (path, fnames, labels, valid_pct=0.2, seed:int=None, y_block=None, item_tfms=None, batch_tfms=None, img_cls=<class 'fastai.vision.core.PILImage'>, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
Create from list of fnames and labels in path
| Type | Default | Details | |
|---|---|---|---|
| path | str | Path | . | Path to put in DataLoaders |
| fnames | |||
| labels | |||
| valid_pct | float | 0.2 | |
| seed | int | None | |
| y_block | NoneType | None | |
| item_tfms | NoneType | None | |
| batch_tfms | NoneType | None | |
| img_cls | BypassNewMeta | PILImage | |
| bs | int | 64 | Size of batch |
| val_bs | int | None | Size of batch for validation DataLoader |
| shuffle | bool | True | Whether to shuffle data |
| device | NoneType | None | Device to put DataLoaders |
验证集是一个随机子集,大小为valid_pct,可以选择使用seed进行可重复性创建。可以传入y_block来指定目标的类型。
path = untar_data(URLs.PETS)
fnames = get_image_files(path/"images")
labels = ['_'.join(x.name.split('_')[:-1]) for x in fnames]
dls = ImageDataLoaders.from_lists(path, fnames, labels)Downloading a new version of this dataset...
class SegmentationDataLoaders(DataLoaders):
    "Basic wrapper around several `DataLoader`s with factory methods for segmentation problems"
    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_label_func(cls, path, fnames, label_func, valid_pct=0.2, seed=None, codes=None, item_tfms=None, batch_tfms=None,
                        img_cls=PILImage, **kwargs):
        "Create from list of `fnames` in `path`s with `label_func`."
        # Inputs are images of `img_cls`; targets are masks, optionally carrying `codes`.
        blocks = (ImageBlock(img_cls), MaskBlock(codes=codes))
        splitter = RandomSplitter(valid_pct, seed=seed)
        dblock = DataBlock(blocks=blocks,
                           splitter=splitter,
                           get_y=label_func,
                           item_tfms=item_tfms,
                           batch_tfms=batch_tfms)
        return cls.from_dblock(dblock, fnames, path=path, **kwargs)

show_doc(SegmentationDataLoaders.from_label_func)
# The validation set is a random subset of `valid_pct`, optionally created with `seed` for
# reproducibility. `codes` contain the mapping index to label.
path = untar_data(URLs.CAMVID_TINY)
fnames = get_image_files(path/'images')
def label_func(x): return path/'labels'/f'{x.stem}_P{x.suffix}'
codes = np.loadtxt(path/'codes.txt', dtype=str)
dls = SegmentationDataLoaders.from_label_func(path, fnames, label_func, codes=codes)导出 -
from nbdev import nbdev_export
nbdev_export()