Building a Dataset (2)

I got a function working that splits an ndarray row-wise and saves it as CSV files:

import os
import numpy as np

def save_to_csv(output_dir, data, name_prefix,
                header=None, n_parts=10):
    """
    numpy.ndarray ----> save to csv (split row-wise into n_parts files)
    """
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []

    for file_idx, row_indices in enumerate(
            np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                # repr keeps full float precision in the CSV
                f.write(",".join(
                    [repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames
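
For reference, np.array_split (unlike np.split) tolerates splits that do not divide evenly, which is why the function works for any n_parts. A quick check:

import numpy as np

# 10 row indices split into 3 parts: the extra row goes to the first part
print(np.array_split(np.arange(10), 3))
# [array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]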

Call the function and test it with the fetch_california_housing dataset:

output_dir = "/Users/squareface/code/generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

header_cols = housing.feature_names + ["MedianHouseValue"]
head_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir, train_data, "train",
                              head_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",
                              head_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",
                             head_str, n_parts=10)
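
The variables x_train_scaled, y_train, and friends come from an earlier preprocessing step not shown in this section. A minimal sketch of one way to produce them, assuming sklearn's train_test_split and StandardScaler (the random_state values here are illustrative):

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()

# split into train / valid / test
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11)

# standardize features, fitting the scaler on the training set only
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)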

TensorFlow

While reading code by experienced developers, I noticed that the tf.data API makes reading CSV files quite convenient, so I went through the TensorFlow 2.0 API docs.

Here is how tf reads all the CSV files in a directory:

import tensorflow as tf

# first step:  filename ----> filenames
# second step: read file ----> dataset ----> datasets ----> merge
# third step:  parse csv
def parse_csv_line(line, n_fields=9):
    # one default per field: fixes both the field count and the dtype (float32)
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])   # first 8 fields are the features
    y = tf.stack(parsed_fields[-1:])    # last field is the label
    return x, y

# parse_csv_line(b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138',
#                n_fields=9)
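
# Quick illustrative check of tf.io.decode_csv (not from the original post):
# it turns one CSV line into a list of tensors, one per field, with
# record_defaults supplying each field's default value and dtype.
sample_fields = tf.io.decode_csv("1,2,3", record_defaults=[tf.constant(0.0)] * 3)
print(sample_fields)   # three scalar float32 tensors: 1.0, 2.0, 3.0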

def csv_reader_dataset(filenames, n_readers=5,
                       batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    # read n_readers files concurrently, skipping each file's header line
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    # shuffle returns a new dataset, so the result must be assigned back
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

import pprint

train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)
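
Because csv_reader_dataset calls repeat(), the resulting dataset is infinite, so Keras has to be told how many batches make up one epoch. A minimal sketch of feeding these datasets into a model (the network itself is illustrative, not from the original):

batch_size = 32
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(30, activation="relu", input_shape=[8]),
    tf.keras.layers.Dense(1),
])
model.compile(loss="mse", optimizer="sgd")

# repeat() makes the dataset infinite, so state batches-per-epoch explicitly
history = model.fit(train_set,
                    steps_per_epoch=len(train_data) // batch_size,
                    validation_data=valid_set,
                    validation_steps=len(valid_data) // batch_size,
                    epochs=10)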