oneflow.data

class oneflow.data.BlobConf(name: str, shape: Sequence[int], dtype: oneflow._oneflow_internal.dtype, codec: Union[oneflow.python.ops.data_ops.ImageCodec, oneflow.python.ops.data_ops.RawCodec], preprocessors: Optional[Sequence[oneflow.python.ops.data_ops.NormByChannelPreprocessor]] = None)
decode_blob(input_blob: oneflow._oneflow_internal.BlobDesc, batch_size: int) → oneflow._oneflow_internal.BlobDesc
class oneflow.data.ImageCodec(image_preprocessors: Optional[Sequence[Union[oneflow.python.ops.data_ops.ImagePreprocessor, oneflow.python.ops.data_ops.ImageResizePreprocessor]]] = None)
color_space() → str
do_mirror() → bool
do_resize()
oneflow.data.ImageDecoderRandomCropResize(input_blob: oneflow._oneflow_internal.BlobDesc, target_width: int, target_height: int, num_attempts: Optional[int] = None, seed: Optional[int] = None, random_area: Optional[Sequence[float]] = None, random_aspect_ratio: Optional[Sequence[float]] = None, num_workers: Optional[int] = None, warmup_size: Optional[int] = None, max_num_pixels: Optional[int] = None, name: Optional[str] = None) → Tuple[oneflow._oneflow_internal.BlobDesc]
class oneflow.data.ImagePreprocessor(preprocessor: str)
is_mirror() → bool
is_rgb() → bool
class oneflow.data.ImageResizePreprocessor(width: int, height: int)
oneflow.data.MegatronGPTMMapDataLoader(data_file_prefix: str, seq_length: int, num_samples: int, batch_size: int, dtype: oneflow._oneflow_internal.dtype = oneflow.int64, shuffle: bool = True, random_seed: Optional[int] = None, split_sizes: Optional[Sequence[str]] = None, split_index: Optional[int] = None, parallel_distribution: Optional[Sequence[str]] = None, start_from_saved_progress: bool = False, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc
class oneflow.data.NormByChannelPreprocessor(mean_values: Union[List[float], Tuple[float]], std_values: Union[List[float], Tuple[float]] = (1.0, 1.0, 1.0), data_format: str = 'channels_last')
output_layout() → str
oneflow.data.OFRecordBytesDecoder(input_blob: oneflow._oneflow_internal.BlobDesc, blob_name: str, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc
oneflow.data.OFRecordImageDecoder(input_blob: oneflow._oneflow_internal.BlobDesc, blob_name: str, color_space: str = 'BGR', name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc

This operator is an image decoder.

Parameters
  • input_blob (oneflow._oneflow_internal.BlobDesc) – The input Blob

  • blob_name (str) – The name of the input Blob

  • color_space (str, optional) – The color space, such as “RGB”, “BGR”. Defaults to “BGR”.

  • name (Optional[str], optional) – The name for the operation. Defaults to None.

Returns

The result Blob

Return type

oneflow._oneflow_internal.BlobDesc

For example:

import oneflow as flow
import oneflow.typing as tp
from typing import Tuple


@flow.global_function(type="predict")
def image_decoder_job() -> Tuple[tp.Numpy, tp.Numpy]:
    batch_size = 16
    color_space = "RGB"
    # our ofrecord file path is "./imgdataset/part-0"
    ofrecord = flow.data.ofrecord_reader(
        "./imgdataset",
        batch_size=batch_size,
        data_part_num=1,
        part_name_suffix_length=-1,
        part_name_prefix='part-',
        random_shuffle=True,
        shuffle_after_epoch=True,
    )
    image = flow.data.OFRecordImageDecoder(
            ofrecord, "encoded", color_space=color_space
        )
    res_image, scale, new_size = flow.image.Resize(
            image, target_size=(224, 224)
        )
    label = flow.data.OFRecordRawDecoder(
        ofrecord, "class/label", shape=(1, ), dtype=flow.int32
    )

    return res_image, label

if __name__ == "__main__":
    images, labels = image_decoder_job()
    # images.shape (16, 224, 224, 3)
oneflow.data.OFRecordImageDecoderRandomCrop(input_blob: oneflow._oneflow_internal.BlobDesc, blob_name: str, color_space: str = 'BGR', num_attempts: int = 10, seed: Optional[int] = None, random_area: Sequence[float] = [0.08, 1.0], random_aspect_ratio: Sequence[float] = [0.75, 1.333333], name: str = 'OFRecordImageDecoderRandomCrop') → oneflow._oneflow_internal.BlobDesc

This operator is an image decoder with random crop.

Parameters
  • input_blob (oneflow._oneflow_internal.BlobDesc) – The input Blob

  • blob_name (str) – The name of the Blob

  • color_space (str, optional) – The color space, such as “RGB”, “BGR”. Defaults to “BGR”.

  • num_attempts (int, optional) – The maximum number of random cropping attempts. Defaults to 10.

  • seed (Optional[int], optional) – The random seed. Defaults to None.

  • random_area (Sequence[float], optional) – The random cropping area. Defaults to [0.08, 1.0].

  • random_aspect_ratio (Sequence[float], optional) – The range of random aspect ratios for the crop. Defaults to [0.75, 1.333333].

  • name (str, optional) – The name for the operation. Defaults to “OFRecordImageDecoderRandomCrop”.

Returns

The randomly cropped Blob

Return type

oneflow._oneflow_internal.BlobDesc

For example:

import oneflow as flow
import oneflow.typing as tp
from typing import Tuple


@flow.global_function(type="predict")
def ofrecord_reader_job() -> Tuple[tp.Numpy, tp.Numpy]:
    batch_size = 16
    color_space = "RGB"
    # our ofrecord file path is "./imgdataset/part-0"
    ofrecord = flow.data.ofrecord_reader(
        "./imgdataset",
        batch_size=batch_size,
        data_part_num=1,
        part_name_suffix_length=-1,
        part_name_prefix='part-',
        random_shuffle=True,
        shuffle_after_epoch=True,
    )
    image = flow.data.OFRecordImageDecoderRandomCrop(
            ofrecord, "encoded", color_space=color_space
        )
    res_image, scale, new_size = flow.image.Resize(
            image, target_size=(224, 224)
        )
    label = flow.data.OFRecordRawDecoder(
        ofrecord, "class/label", shape=(1, ), dtype=flow.int32
    )

    return res_image, label

if __name__ == "__main__":
    images, labels = ofrecord_reader_job()
    # images.shape (16, 224, 224, 3)
oneflow.data.OFRecordRawDecoder(input_blob: oneflow._oneflow_internal.BlobDesc, blob_name: str, shape: Sequence[int], dtype: oneflow._oneflow_internal.dtype, dim1_varying_length: bool = False, auto_zero_padding: bool = False, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc
oneflow.data.OneRecDecoder(input_blob, key, dtype, shape, is_dynamic=False, reshape=None, batch_padding=None, name=None)
class oneflow.data.RawCodec(auto_zero_padding: bool = False)
oneflow.data.coco_reader(annotation_file: str, image_dir: str, batch_size: int, shuffle: bool = True, random_seed: Optional[int] = None, group_by_aspect_ratio: bool = True, stride_partition: bool = True, remove_images_without_annotations: bool = True, name: str = None) → oneflow._oneflow_internal.BlobDesc
oneflow.data.decode_random(shape: Sequence[int], dtype: oneflow._oneflow_internal.dtype, batch_size: int = 1, initializer: Optional[oneflow.core.job.initializer_conf_pb2.InitializerConf] = None, tick: Optional[oneflow._oneflow_internal.BlobDesc] = None, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc
oneflow.data.image_decoder_random_crop_resize(input_blob: oneflow._oneflow_internal.BlobDesc, target_width: int, target_height: int, num_attempts: Optional[int] = None, seed: Optional[int] = None, random_area: Optional[Sequence[float]] = None, random_aspect_ratio: Optional[Sequence[float]] = None, num_workers: Optional[int] = None, warmup_size: Optional[int] = None, max_num_pixels: Optional[int] = None, name: Optional[str] = None) → Tuple[oneflow._oneflow_internal.BlobDesc]
oneflow.data.load_mnist(train_batch_size=100, test_batch_size=100, data_format='NCHW', url='https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist.npz', hash_check='63d4344077849053dc3036b247fa012b2b381de53fd055a66b539dffd76cf08e', out_dir='.')
Load the MNIST dataset and return its images and labels.

If the dataset does not exist locally, it is downloaded to the directory specified by out_dir.

Parameters
  • train_batch_size (int, optional) – batch size for train. Defaults to 100.

  • test_batch_size (int, optional) – batch size for test or evaluate. Defaults to 100.

  • data_format (str, optional) – data format. Defaults to “NCHW”.

  • url (str, optional) – url to get mnist.npz. Defaults to “https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/mnist.npz”.

  • hash_check (str, optional) – file hash value. Defaults to “63d4344077849053dc3036b247fa012b2b381de53fd055a66b539dffd76cf08e”.

  • out_dir (str, optional) – dir to save downloaded file. Defaults to “.”.

Returns

(train_images, train_labels), (test_images, test_labels)

Return type

Tuple[Tuple, Tuple]

oneflow.data.megatron_gpt_mmap_data_loader(data_file_prefix: str, seq_length: int, num_samples: int, batch_size: int, dtype: oneflow._oneflow_internal.dtype = oneflow.int64, shuffle: bool = True, random_seed: Optional[int] = None, split_sizes: Optional[Sequence[str]] = None, split_index: Optional[int] = None, parallel_distribution: Optional[Sequence[str]] = None, start_from_saved_progress: bool = False, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc
oneflow.data.ofrecord_bytes_decoder(input_blob: oneflow._oneflow_internal.BlobDesc, blob_name: str, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc
oneflow.data.ofrecord_image_classification_reader(ofrecord_dir: str, image_feature_name: str, label_feature_name: str, batch_size: int = 1, data_part_num: int = 1, part_name_prefix: str = 'part-', part_name_suffix_length: int = -1, random_shuffle: bool = False, shuffle_buffer_size: int = 1024, shuffle_after_epoch: bool = False, color_space: str = 'BGR', decode_buffer_size_per_thread: int = 32, num_decode_threads_per_machine: Optional[int] = None, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc

This operator creates a reader for image classification tasks.

Parameters
  • ofrecord_dir (str) – The directory of ofrecord file.

  • image_feature_name (str) – The name of the image feature.

  • label_feature_name (str) – The name of the label feature.

  • batch_size (int, optional) – The batch_size. Defaults to 1.

  • data_part_num (int, optional) – The number of data parts. Defaults to 1.

  • part_name_prefix (str, optional) – The prefix of data part name. Defaults to “part-”.

  • part_name_suffix_length (int, optional) – The length of the suffix of the data part name. Defaults to -1.

  • random_shuffle (bool, optional) – Whether to random shuffle the data. Defaults to False.

  • shuffle_buffer_size (int, optional) – The buffer size for shuffle data. Defaults to 1024.

  • shuffle_after_epoch (bool, optional) – Whether to shuffle the data after each epoch. Defaults to False.

  • color_space (str, optional) – The color space. Defaults to “BGR”.

  • decode_buffer_size_per_thread (int, optional) – The decode buffer size per thread. Defaults to 32.

  • num_decode_threads_per_machine (Optional[int], optional) – The number of decode threads per machine. Defaults to None.

  • name (Optional[str], optional) – The name for the operation. Defaults to None.

Returns

The result Blob.

Return type

oneflow._oneflow_internal.BlobDesc

For example:

import oneflow as flow
import oneflow.typing as tp
from typing import Tuple


@flow.global_function(type="predict")
def image_classifier_job() -> Tuple[tp.Numpy, tp.Numpy]:
    image, label = flow.data.ofrecord_image_classification_reader(
        ofrecord_dir="./imgdataset",
        image_feature_name="encoded",
        label_feature_name="class/label",
        batch_size=8,
        data_part_num=1,
        part_name_prefix="part-",
        part_name_suffix_length=-1,
        random_shuffle=False,
        shuffle_after_epoch=False,
        color_space="RGB",
        decode_buffer_size_per_thread=16,
    )
    res_image, scale, new_size = flow.image.Resize(
            image, target_size=(224, 224)
        )
    return res_image, label


if __name__ == "__main__":
    images, labels = image_classifier_job()
    # images.shape (8, 224, 224, 3)
oneflow.data.ofrecord_image_decoder(input_blob: oneflow._oneflow_internal.BlobDesc, blob_name: str, color_space: str = 'BGR', name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc

This operator is an image decoder.

Parameters
  • input_blob (oneflow._oneflow_internal.BlobDesc) – The input Blob

  • blob_name (str) – The name of the input Blob

  • color_space (str, optional) – The color space, such as “RGB”, “BGR”. Defaults to “BGR”.

  • name (Optional[str], optional) – The name for the operation. Defaults to None.

Returns

The result Blob

Return type

oneflow._oneflow_internal.BlobDesc

For example:

import oneflow as flow
import oneflow.typing as tp
from typing import Tuple


@flow.global_function(type="predict")
def image_decoder_job() -> Tuple[tp.Numpy, tp.Numpy]:
    batch_size = 16
    color_space = "RGB"
    # our ofrecord file path is "./imgdataset/part-0"
    ofrecord = flow.data.ofrecord_reader(
        "./imgdataset",
        batch_size=batch_size,
        data_part_num=1,
        part_name_suffix_length=-1,
        part_name_prefix='part-',
        random_shuffle=True,
        shuffle_after_epoch=True,
    )
    image = flow.data.OFRecordImageDecoder(
            ofrecord, "encoded", color_space=color_space
        )
    res_image, scale, new_size = flow.image.Resize(
            image, target_size=(224, 224)
        )
    label = flow.data.OFRecordRawDecoder(
        ofrecord, "class/label", shape=(1, ), dtype=flow.int32
    )

    return res_image, label

if __name__ == "__main__":
    images, labels = image_decoder_job()
    # images.shape (16, 224, 224, 3)
oneflow.data.ofrecord_image_decoder_random_crop(input_blob: oneflow._oneflow_internal.BlobDesc, blob_name: str, color_space: str = 'BGR', num_attempts: int = 10, seed: Optional[int] = None, random_area: Sequence[float] = [0.08, 1.0], random_aspect_ratio: Sequence[float] = [0.75, 1.333333], name: str = 'OFRecordImageDecoderRandomCrop') → oneflow._oneflow_internal.BlobDesc

This operator is an image decoder with random crop.

Parameters
  • input_blob (oneflow._oneflow_internal.BlobDesc) – The input Blob

  • blob_name (str) – The name of the Blob

  • color_space (str, optional) – The color space, such as “RGB”, “BGR”. Defaults to “BGR”.

  • num_attempts (int, optional) – The maximum number of random cropping attempts. Defaults to 10.

  • seed (Optional[int], optional) – The random seed. Defaults to None.

  • random_area (Sequence[float], optional) – The random cropping area. Defaults to [0.08, 1.0].

  • random_aspect_ratio (Sequence[float], optional) – The range of random aspect ratios for the crop. Defaults to [0.75, 1.333333].

  • name (str, optional) – The name for the operation. Defaults to “OFRecordImageDecoderRandomCrop”.

Returns

The randomly cropped Blob

Return type

oneflow._oneflow_internal.BlobDesc

For example:

import oneflow as flow
import oneflow.typing as tp
from typing import Tuple


@flow.global_function(type="predict")
def ofrecord_reader_job() -> Tuple[tp.Numpy, tp.Numpy]:
    batch_size = 16
    color_space = "RGB"
    # our ofrecord file path is "./imgdataset/part-0"
    ofrecord = flow.data.ofrecord_reader(
        "./imgdataset",
        batch_size=batch_size,
        data_part_num=1,
        part_name_suffix_length=-1,
        part_name_prefix='part-',
        random_shuffle=True,
        shuffle_after_epoch=True,
    )
    image = flow.data.OFRecordImageDecoderRandomCrop(
            ofrecord, "encoded", color_space=color_space
        )
    res_image, scale, new_size = flow.image.Resize(
            image, target_size=(224, 224)
        )
    label = flow.data.OFRecordRawDecoder(
        ofrecord, "class/label", shape=(1, ), dtype=flow.int32
    )

    return res_image, label

if __name__ == "__main__":
    images, labels = ofrecord_reader_job()
    # images.shape (16, 224, 224, 3)
oneflow.data.ofrecord_loader(ofrecord_dir: str, batch_size: int = 1, data_part_num: int = 1, part_name_prefix: str = 'part-', part_name_suffix_length: int = -1, shuffle: bool = False, shuffle_buffer_size: int = 1024, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc
oneflow.data.ofrecord_raw_decoder(input_blob: oneflow._oneflow_internal.BlobDesc, blob_name: str, shape: Sequence[int], dtype: oneflow._oneflow_internal.dtype, dim1_varying_length: bool = False, auto_zero_padding: bool = False, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc
oneflow.data.ofrecord_reader(ofrecord_dir: str, batch_size: int = 1, data_part_num: int = 1, part_name_prefix: str = 'part-', part_name_suffix_length: int = -1, random_shuffle: bool = False, shuffle_buffer_size: int = 1024, shuffle_after_epoch: bool = False, name: Optional[str] = None) → oneflow._oneflow_internal.BlobDesc

Get ofrecord object from ofrecord dataset.

Parameters
  • ofrecord_dir (str) – Path to ofrecord dataset.

  • batch_size (int, optional) – Batch size. Defaults to 1.

  • data_part_num (int, optional) – Number of dataset’s partitions. Defaults to 1.

  • part_name_prefix (str, optional) – Prefix of the dataset’s partition files. Defaults to “part-”.

  • part_name_suffix_length (int, optional) – Total length of the padded suffix number; -1 means no padding. E.g., 3 for part-001. Defaults to -1.

  • random_shuffle (bool, optional) – Whether to shuffle the records. Defaults to False.

  • shuffle_buffer_size (int, optional) – Shuffle buffer size. Defaults to 1024.

  • shuffle_after_epoch (bool, optional) – Whether to shuffle the records after each epoch. Defaults to False.

  • name (Optional[str], optional) – The name for the operation. Defaults to None.

Returns

The result Blob

Return type

oneflow._oneflow_internal.BlobDesc

For example:

import oneflow as flow
import oneflow.typing as tp
from typing import Tuple


@flow.global_function(type="predict")
def ofrecord_reader_job() -> Tuple[tp.Numpy, tp.Numpy]:
    batch_size = 16
    with flow.scope.placement("cpu", "0:0"):
        # our ofrecord file path is "./dataset/part-0"
        ofrecord = flow.data.ofrecord_reader(
            "./dataset/",
            batch_size=batch_size,
            data_part_num=1,
            part_name_suffix_length=-1,
            part_name_prefix='part-',
            random_shuffle=True,
            shuffle_after_epoch=True,
        )
        # image shape is (28*28, )
        image = flow.data.OFRecordRawDecoder(
            ofrecord, "images", shape=(784, ), dtype=flow.int32
        )
        # label shape is (1, )
        label = flow.data.OFRecordRawDecoder(
            ofrecord, "labels", shape=(1, ), dtype=flow.int32
        )

        return image, label

if __name__ == "__main__":
    images, labels = ofrecord_reader_job()
    print("In per batch, images shape is", images.shape)
    print("In per batch, labels shape is", labels.shape)

    # In per batch, images shape is (16, 784)
    # In per batch, labels shape is (16, 1)
oneflow.data.onerec_decoder(input_blob, key, dtype, shape, is_dynamic=False, reshape=None, batch_padding=None, name=None)
oneflow.data.onerec_reader(files, batch_size=1, random_shuffle=False, shuffle_mode='instance', shuffle_buffer_size=1024, shuffle_after_epoch=False, verify_example=True, name=None)