import os
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# BASE_DIR is assumed to point at the root of the landmark-recognition-2021 data.
# Load the test images as an unlabelled, unshuffled dataset so that images and
# file paths stay in the same order.
global_test_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    BASE_DIR + '/test', label_mode=None, shuffle=False, batch_size=1, image_size=(224, 224))

# Strip the directory and the '.jpg' extension to recover the image ids.
filepath = [x[:-4] for x in map(os.path.basename, global_test_dataset.file_paths)]
filepath_ds = tf.data.Dataset.from_tensor_slices(filepath)
dev_test_dataset = tf.data.Dataset.zip((global_test_dataset.unbatch(), filepath_ds))
global_test_dataset_size = len(filepath)
print('test images: ', global_test_dataset_size)

# Re-encode every image as a quality-70 JPEG and write it, together with its id,
# into a single TFRecord file.
with tf.io.TFRecordWriter('landmark-recognition-2021-test.tfrec') as file_writer:
    for img, path in tqdm(dev_test_dataset.as_numpy_iterator(), total=global_test_dataset_size):
        img = tf.cast(tf.image.resize(img, [224, 224], method='nearest'), 'uint8')
        img_jpeg = tf.io.encode_jpeg(img, quality=70, optimize_size=True).numpy()
        record_bytes = tf.train.Example(features=tf.train.Features(feature={
            'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_jpeg])),
            'id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[path])),
        })).SerializeToString()
        file_writer.write(record_bytes)
def decode_tfrecord(record_bytes):
    # Parse a single serialized Example back into an image tensor and its id.
    features = tf.io.parse_single_example(record_bytes, {
        'image': tf.io.FixedLenFeature([], tf.string),
        'id': tf.io.FixedLenFeature([], tf.string)
    })
    img = tf.io.decode_jpeg(features['image'])
    img = tf.reshape(img, [224, 224, 3])
    return {'image': img, 'id': features['id']}

# AUTO is not defined in the original snippet; tf.data.AUTOTUNE is assumed here.
AUTO = tf.data.AUTOTUNE

# Read the test TFRecord back as a tf.data pipeline.
FNAMES_TRAIN_TFRECORDS = np.sort(tf.io.gfile.glob(BASE_DIR + '/landmark-recognition-2021-test.tfrec'))
global_train_ds = tf.data.TFRecordDataset(FNAMES_TRAIN_TFRECORDS, num_parallel_reads=None)
global_train_ds = global_train_ds.map(decode_tfrecord, num_parallel_calls=AUTO)
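The decoded records can then be batched and fed to a model while the stored `id` strings are carried along, which is convenient for building a submission-style table. A minimal sketch, where `model` is a placeholder for whatever landmark model you train (it is not defined in the snippets above) and the batch size is illustrative:

```python
import pandas as pd

# A minimal sketch, assuming `model` is a Keras model taking (224, 224, 3) image batches.
test_ds = global_train_ds.batch(64).prefetch(AUTO)

ids, preds = [], []
for batch in test_ds:
    # Keep the image ids in the same order as the predictions.
    ids.extend(s.decode() for s in batch['id'].numpy())
    preds.append(model.predict_on_batch(tf.cast(batch['image'], tf.float32)))

predictions = pd.DataFrame({'id': ids})
# np.concatenate(preds) holds the per-image outputs aligned with `ids`.
```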
40,000 lines of Shakespeare from a variety of Shakespeare's plays. Featured in Andrej Karpathy's blog post 'The Unreasonable Effectiveness of Recurrent Neural Networks': http://karpathy.github.io/2015/05/21/rnn-effectiveness/.
To use for e.g. character modelling:
d = tfds.load(name='tiny_shakespeare')['train']
d = d.map(lambda x: tf.strings.unicode_split(x['text'], 'UTF-8'))
# train split includes vocabulary for other splits
vocabulary = sorted(set(next(iter(d)).numpy()))
d = d.map(lambda x: {'cur_char': x[:-1], 'next_char': x[1:]})
d = d.unbatch()
seq_len = 100
batch_size = 2
d = d.batch(seq_len)
d = d.batch(batch_size)
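After the two batching steps, each element of `d` is a dictionary of aligned character tensors. A small sketch for inspecting one batch, reusing the `d`, `seq_len` and `batch_size` defined above (nothing else is assumed):

```python
# Pull one batch from the pipeline built above and check its structure.
example = next(iter(d))
print(example['cur_char'].shape)   # (2, 100), i.e. (batch_size, seq_len) for a full batch
print(example['next_char'].shape)  # (2, 100)
# Each entry is a single UTF-8 character; next_char is cur_char shifted by one position.
```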
To use this dataset:
import tensorflow_datasets as tfds
ds = tfds.load('tiny_shakespeare', split='train')
for ex in ds.take(4):
  print(ex)
See the guide for more information on tensorflow_datasets.
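If you also need the dataset metadata (feature spec and available splits), `tfds.load` can return a `DatasetInfo` object alongside the data; a small sketch:

```python
import tensorflow_datasets as tfds

# with_info=True additionally returns the DatasetInfo describing the dataset.
ds, info = tfds.load('tiny_shakespeare', split='train', with_info=True)
print(info.features)      # FeaturesDict with a single 'text' field
print(list(info.splits))  # e.g. ['train', 'validation', 'test']
```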
# Iterate over the 16 top-level hex directories ('0'..'f') of the training set and
# write one TFRecord shard per directory, with the shard's image count in the file name.
for i in range(16):
    idx = hex(i)[2:]
    record_ds = tf.keras.preprocessing.image_dataset_from_directory(
        BASE_DIR + '/train/' + idx, label_mode=None, shuffle=False, batch_size=1, image_size=(224, 224))
    chunk_size = len(record_ds.file_paths)
    record_ds = record_ds.unbatch()
    with tf.io.TFRecordWriter('landmark-recognition-2021-part' + idx + '-' + str(chunk_size) + '.tfrec') as file_writer:
        for img in tqdm(record_ds.as_numpy_iterator(), total=chunk_size):
            img = tf.cast(tf.image.resize(img, [224, 224], method='nearest'), 'uint8')
            img_jpeg = tf.io.encode_jpeg(img, quality=70, optimize_size=True).numpy()
            record_bytes = tf.train.Example(features=tf.train.Features(feature={
                'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_jpeg]))
            })).SerializeToString()
            file_writer.write(record_bytes)
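Because each shard's file name ends with its image count, the total number of training images can be recovered from the file names alone, without iterating over the records. A minimal sketch, assuming the shards follow the naming scheme produced above and sit under BASE_DIR as in the loading code that follows:

```python
# Recover per-shard and total image counts from file names of the form
# 'landmark-recognition-2021-part<idx>-<count>.tfrec'.
shard_paths = tf.io.gfile.glob(BASE_DIR + '/landmark-recognition-2021-part*.tfrec')
shard_sizes = [int(os.path.basename(p).rsplit('-', 1)[1][:-len('.tfrec')]) for p in shard_paths]
total_train_images = sum(shard_sizes)
print('train images:', total_train_images)
```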
The images in the dataset correspond one-to-one to the labels in the CSV file, so the label rows, sorted by image id, can be zipped directly with the decoded images.

```python
import pandas as pd

# Decode a training record: only the JPEG image is stored in the train shards.
def decode_tfrecord(record_bytes):
    features = tf.io.parse_single_example(record_bytes, {
        'image': tf.io.FixedLenFeature([], tf.string),
    })
    img = tf.io.decode_jpeg(features['image'])
    img = tf.reshape(img, [224, 224, 3])
    return img

FNAMES_TRAIN_TFRECORDS = np.sort(tf.io.gfile.glob(BASE_DIR + '/*.tfrec'))
global_train_ds = tf.data.TFRecordDataset(FNAMES_TRAIN_TFRECORDS, num_parallel_reads=None)
global_train_ds = global_train_ds.map(decode_tfrecord, num_parallel_calls=AUTO)

# Map the raw landmark_id values to dense integer class codes, sorted by image id
# so the rows line up with the image order in the TFRecords.
labels = pd.read_csv(BASE_DIR + '/train.csv', index_col='id')
labels.sort_index(inplace=True)
labels['label'] = labels['landmark_id'].astype('category').cat.codes

labels_ds = tf.data.Dataset.from_tensor_slices(labels['label'].values)
dev_dataset = tf.data.Dataset.zip((global_train_ds, labels_ds))
```
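From here, `dev_dataset` can be shuffled, batched and prefetched for training. A minimal sketch; the shuffle buffer and batch size are illustrative choices, not values from the original code:

```python
# Number of distinct landmark classes, from the label codes built above.
num_classes = labels['label'].nunique()

# Illustrative training input pipeline over (image, label) pairs.
train_ds = (dev_dataset
            .shuffle(10_000)
            .batch(64)
            .prefetch(AUTO))

# train_ds now yields (image, label) batches, e.g. for model.fit(train_ds, ...)
# with a classifier whose output dimension is num_classes.
```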