Speeding Up Reading of Large Image Datasets with Python Multiprocessing
Published: 2019-05-25


Speeding up dataset loading with multiprocessing before training a computer-vision neural network

多进程读取图片并压缩.py (reads the images with multiple processes and pickles the result)

import readImgMultiProcessing, os, random, gc, time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imutils import paths
import numpy as np

seed = 300
random.seed(seed)
coreNum = 23

imgPath = r''           # dataset root directory (path elided in the original post)
dataPath = r''          # pickled train/val arrays (path elided in the original post)
dataTestPath = r''      # pickled test arrays (path elided in the original post)
dataEncoderPath = r'babyFaceLabelEncoder.data'

config = {"epochs": 10, "batch_size": 128, 'useIDG': True,
          "imageResize": (600, 600), "lr": 1e-3}

if not os.path.exists('tmp'):
    os.makedirs('tmp')

if __name__ == '__main__':
    if os.path.exists(dataPath) and os.path.exists(dataTestPath):
        # The pickled arrays already exist: load them directly
        (X_train, X_val, y_train, y_val) = readImgMultiProcessing.readFile(dataPath)
        (X_test, Y_test) = readImgMultiProcessing.readFile(dataTestPath)
        class_le = readImgMultiProcessing.readFile(dataEncoderPath)
    else:
        tst = time.time()
        class_le = LabelEncoder()
        class_le.fit(['睡', '醒'])  # the two label folders: '睡' (asleep) and '醒' (awake)
        # Collect every image path under the dataset root
        imagePaths = sorted(list(paths.list_images(imgPath)))
        # Keep only the asleep/awake baby-face images
        faceImagePaths = []
        for img_path in imagePaths:
            if (img_path.split(os.path.sep)[-2] == '睡'
                    or img_path.split(os.path.sep)[-2] == '醒'):
                faceImagePaths.append(img_path)
        # Shuffle the list
        random.shuffle(faceImagePaths)
        # Read only a subset here if memory is insufficient
        # faceImagePaths = faceImagePaths[:234]
        # Split into train/validation/test with train_test_split;
        # note that what is split here is the list of *paths*, not the images
        trainImgPaths, testImgPaths = train_test_split(faceImagePaths, test_size=0.2, random_state=seed)
        trainImgPaths, valImgPaths = train_test_split(trainImgPaths, test_size=0.2, random_state=seed)
        print('Path lists split')
        # Read the images in parallel
        X_train = readImgMultiProcessing.readImgMultiProcessing(trainImgPaths, coreNum, config)
        print('Training set read')
        X_val = readImgMultiProcessing.readImgMultiProcessing(valImgPaths, coreNum, config)
        print('Validation set read')
        X_test = readImgMultiProcessing.readImgMultiProcessing(testImgPaths, coreNum, config)
        print('Test set read')
        # getData already scales pixels to [0, 1], so no further
        # division by 255 is needed here
        X_train = np.asarray(X_train, dtype=float)
        print('Training set processed')
        X_val = np.asarray(X_val, dtype=float)
        print('Validation set processed')
        X_test = np.asarray(X_test, dtype=float)
        print('Test set processed')
        # Labels come from the parent folder name of each path
        y_train = class_le.transform([x.split(os.path.sep)[-2] for x in trainImgPaths])
        y_val = class_le.transform([x.split(os.path.sep)[-2] for x in valImgPaths])
        Y_test = class_le.transform([x.split(os.path.sep)[-2] for x in testImgPaths])
        # Save the arrays so later runs can skip the reading step
        readImgMultiProcessing.toFile(class_le, dataEncoderPath)
        readImgMultiProcessing.toFile((X_train, X_val, y_train, y_val), dataPath)
        readImgMultiProcessing.toFile((X_test, Y_test), dataTestPath)
        print('Dataset compressed and saved')
        print(len(trainImgPaths), X_train.shape, len(y_train))
        print(len(valImgPaths), X_val.shape, len(y_val))
        print(len(testImgPaths), X_test.shape, len(Y_test))
        print('Elapsed:', time.time() - tst)  # 934 s

readImgMultiProcessing.py

from multiprocessing import Process, Manager
import os
import pickle
import numpy as np
from PIL import Image
from imutils import paths  # used by the commented-out demo below
import cv2


def toFile(data, path):
    with open(path, 'wb') as f:
        pickle.dump(data, f, protocol=4)  # protocol 4 supports pickles larger than 4 GB


def readFile(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data


def resize_img_keep_ratio(img_name, target_size):
    '''
    1. Resize the image: compute the scaling ratio from the longest side,
       then resize by that ratio.
    2. Compute the padding needed on each of the four sides, then pad.
    '''
    try:
        # cv2 + numpy can also read paths with non-ASCII characters:
        # img = cv2.imdecode(np.fromfile(img_name, dtype=np.uint8), -1)
        # PIL is used here to avoid errors with non-ASCII paths and PNG files
        im = Image.open(img_name)
        # Convert to a numpy array
        im_array = np.array(im)
    except Exception as e:
        print(img_name, e)
        raise  # re-raise: without the image, im_array below would be undefined
    old_size = im_array.shape[0:2]
    ratio = min(float(target_size[i]) / old_size[i] for i in range(len(old_size)))
    new_size = tuple([int(i * ratio) for i in old_size])
    img = cv2.resize(im_array, (new_size[1], new_size[0]), interpolation=cv2.INTER_CUBIC)  # note the interpolation method
    pad_w = target_size[1] - new_size[1]
    pad_h = target_size[0] - new_size[0]
    top, bottom = pad_h // 2, pad_h - (pad_h // 2)
    left, right = pad_w // 2, pad_w - (pad_w // 2)
    # Pad the image with black borders
    img_new = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, None, (0, 0, 0))
    # Drop the alpha channel of 4-channel (RGBA) images, e.g. PNGs;
    # checking the channel count rather than the filename avoids crashing
    # on 3-channel PNGs
    if img_new.ndim == 3 and img_new.shape[-1] == 4:
        return cv2.cvtColor(img_new, cv2.COLOR_RGBA2RGB)
    return img_new


def getData(num, paths, return_dict, config):
    # Worker: read and resize every image in its chunk, then store the
    # normalized array in the shared dict under the worker's id
    Data = []
    for img_path in paths:
        img = resize_img_keep_ratio(img_path, (config["imageResize"][0], config["imageResize"][1]))
        Data.append(img)
    Data = np.array(Data, dtype=float)
    Data /= 255.0  # scale pixel values to [0, 1]
    return_dict[num] = Data


def readImgMultiProcessing(imagePaths, coreNum, config):
    # Split the path list into one chunk per worker process
    lenPerSt = int(len(imagePaths) / coreNum + 1)
    paths = []
    for i in range(coreNum):
        paths.append(imagePaths[i * lenPerSt:(i + 1) * lenPerSt])
    # Shared receiver for the workers' results
    manager = Manager()
    return_dict = manager.dict()
    jobs = []
    # Launch the worker processes
    for i in range(coreNum):
        p = Process(target=getData, args=(str(i), paths[i], return_dict, config))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()
    # Merge the chunks back in worker order
    data = np.asarray(list(return_dict['0']))
    for i in range(1, coreNum):
        x = np.asarray(list(return_dict[str(i)]))
        if int(x.shape[0]) > 0:
            data = np.concatenate((data, x))
    return data


# if __name__ == '__main__':
#     imgPath = r'E:\新的数据集'
#     imagePaths = sorted(list(paths.list_images(imgPath)))[:100]
#     config = {"epochs": 10, "batch_size": 128, 'useIDG': True,
#               "imageResize": (600, 600), "lr": 1e-3}
#     coreNum = 10
#     data = readImgMultiProcessing(imagePaths, coreNum, config)
#     print(data.shape)
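The module above fans work out by hand: it chunks the path list, starts one Process per chunk, and collects results through a Manager dict. The same pattern can be written more compactly with multiprocessing.Pool, which handles chunking, ordering, and result collection itself. The following is only a sketch under the same assumptions (it imports resize_img_keep_ratio from the module above; readImgWithPool and _loadOne are illustrative names, not part of the original code):

from multiprocessing import Pool
from functools import partial
import numpy as np
from readImgMultiProcessing import resize_img_keep_ratio

def _loadOne(img_path, target_size):
    # Worker task: read and letterbox-resize one image.
    # Defined at module top level so multiprocessing can pickle it.
    return resize_img_keep_ratio(img_path, target_size)

def readImgWithPool(imagePaths, coreNum, config):
    target_size = (config["imageResize"][0], config["imageResize"][1])
    with Pool(processes=coreNum) as pool:
        # map() preserves input order and handles chunking itself,
        # so no Manager dict and no manual merge step are needed
        imgs = pool.map(partial(_loadOne, target_size=target_size),
                        imagePaths, chunksize=32)
    return np.asarray(imgs, dtype=np.float32) / 255.0  # float32 halves memory vs float64

A Pool also avoids round-tripping each worker's full array through the Manager's server process, which can be a hidden cost for arrays of this size.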

Reposted from: http://irzti.baihongyu.com/
