Python多进程加速大量图片数据集读取-白红宇

Python多进程加速大量图片数据集读取

阅读量：4142 次

发布时间：2019-05-25

本文共 5974 字，大约阅读时间需要 19 分钟。

训练机器视觉神经网络前多进程加速数据集读取

多进程读取图片并压缩.py

"""Read an image dataset with several processes and cache it as pickle files.

On the first run the script lists the images under ``imgPath``, keeps those
whose parent directory is one of the two class labels, splits the *paths*
into train / validation / test subsets, loads every subset through
``readImgMultiProcessing.readImgMultiProcessing`` and pickles the results.
On later runs the pickled files are loaded instead.
"""
import gc
import os
import random
import time

import numpy as np
from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import readImgMultiProcessing

seed = 300
random.seed(seed)
coreNum = 23  # number of worker processes used to read the images

# NOTE(review): the first three path literals were lost in the published
# listing; the values below are placeholders -- point them at your own image
# directory and cache files before running.
imgPath = r'images'
dataPath = r'tmp/babyFaceTrainVal.data'
dataTestPath = r'tmp/babyFaceTest.data'
dataEncoderPath = r'babyFaceLabelEncoder.data'

config = {
    "epochs": 10, "batch_size": 128, 'useIDG': True,
    "imageResize": (600, 600), "lr": 1e-3,
}

# exist_ok avoids the check-then-create race: with the "spawn" start method
# every worker process re-imports this module and reaches this line too.
os.makedirs(r'tmp', exist_ok=True)

# Class names; an image's class is the name of its parent directory.
_LABELS = ('睡', '醒')


def _label_of(img_path):
    """Return the name of the directory that directly contains *img_path*."""
    return img_path.split(os.path.sep)[-2]


if __name__ == '__main__':
    # All three cache files are read below, so all three must exist.
    cache_files = (dataPath, dataTestPath, dataEncoderPath)
    if all(os.path.exists(p) for p in cache_files):
        (X_train, X_val, y_train, y_val) = readImgMultiProcessing.readFile(dataPath)
        (X_test, Y_test) = readImgMultiProcessing.readFile(dataTestPath)
        class_le = readImgMultiProcessing.readFile(dataEncoderPath)
    else:
        tst = time.time()
        class_le = LabelEncoder()
        class_le.fit(list(_LABELS))

        # Collect every image path, then keep only the two labelled classes.
        imagePaths = sorted(paths.list_images(imgPath))
        faceImagePaths = [p for p in imagePaths if _label_of(p) in _LABELS]

        # Shuffle (seeded above, so the order is reproducible).
        random.shuffle(faceImagePaths)
        # Uncomment to load only part of the data when memory is short.
        # faceImagePaths = faceImagePaths[:234]

        # Split into train / validation / test. Note: the *paths* are split
        # here, the pixels are loaded afterwards.
        trainImgPaths, testImgPaths = train_test_split(
            faceImagePaths, test_size=0.2, random_state=seed)
        trainImgPaths, valImgPaths = train_test_split(
            trainImgPaths, test_size=0.2, random_state=seed)
        print('数据列表划分完成')

        # Load the pixels with coreNum worker processes per subset.
        X_train = readImgMultiProcessing.readImgMultiProcessing(trainImgPaths, coreNum, config)
        print('训练集读入完成')
        X_val = readImgMultiProcessing.readImgMultiProcessing(valImgPaths, coreNum, config)
        print('验证集读入完成')
        X_test = readImgMultiProcessing.readImgMultiProcessing(testImgPaths, coreNum, config)
        print('测试集读入完成')

        # The workers (readImgMultiProcessing.getData) already scale pixels
        # to [0, 1]; dividing by 255 again here would shrink them to
        # [0, 1/255]. Only the dtype is enforced. np.float64 replaces the
        # np.float alias that was removed in NumPy 1.24.
        X_train = np.asarray(X_train, dtype=np.float64)
        print('训练集处理完成')
        X_val = np.asarray(X_val, dtype=np.float64)
        print('验证集处理完成')
        X_test = np.asarray(X_test, dtype=np.float64)
        print('测试集处理完成')

        # Integer labels derived from each image's parent directory name.
        y_train = class_le.transform([_label_of(x) for x in trainImgPaths])
        y_val = class_le.transform([_label_of(x) for x in valImgPaths])
        Y_test = class_le.transform([_label_of(x) for x in testImgPaths])

        # Persist everything so the next run can skip the image decoding.
        readImgMultiProcessing.toFile(class_le, dataEncoderPath)
        readImgMultiProcessing.toFile((X_train, X_val, y_train, y_val), dataPath)
        readImgMultiProcessing.toFile((X_test, Y_test), dataTestPath)
        print('数据集压缩成功，数据保存完毕')

        print(len(trainImgPaths), X_train.shape, len(y_train))
        print(len(valImgPaths), X_val.shape, len(y_val))
        print(len(testImgPaths), X_test.shape, len(Y_test))
        print('用时', time.time()-tst)  # 934s in the author's run

readImgMultiProcessing.py

"""Helpers for loading a large image dataset with several worker processes.

``readImgMultiProcessing`` splits a list of image paths into contiguous
chunks, lets one process per chunk decode / resize / pad the images, and
returns the results concatenated in the original path order.
``toFile`` / ``readFile`` pickle and unpickle arbitrary Python objects.
"""
from multiprocessing import Process, Queue, Pool, Pipe, Manager
import os
import time
import random
from imutils import paths
import numpy as np
from PIL import Image
import pickle
import cv2


def toFile(data, path):
    """Pickle *data* into the file at *path* (overwriting it)."""
    with open(path, 'wb') as f:
        # Protocol 4 supports objects larger than 4 GiB.
        pickle.dump(data, f, protocol=4)


def readFile(path):
    """Return the object pickled in the file at *path*.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on files
    written by ``toFile`` from a trusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def resize_img_keep_ratio(img_name, target_size):
    """Load an image, resize it keeping its aspect ratio, and pad it.

    1. The image is scaled by the largest factor that lets it fit inside
       ``target_size``.
    2. The remaining space is split between opposite borders and filled
       with black.

    Args:
        img_name: Path of the image file.
        target_size: ``(height, width)`` of the returned image.

    Returns:
        An RGB array of shape ``(target_size[0], target_size[1], 3)``.

    Raises:
        Exception: whatever PIL raises when the file cannot be opened or
            decoded; the path and the error are printed first.
    """
    try:
        # PIL is used rather than cv2.imread so that non-ASCII paths and PNG
        # files load. convert('RGB') normalises grayscale, palette and RGBA
        # inputs to three channels so every image stacks into one array.
        with Image.open(img_name) as im:
            im_array = np.array(im.convert('RGB'))
    except Exception as e:
        print(img_name, e)
        # Re-raise: continuing would only fail later on an unbound variable.
        raise

    old_size = im_array.shape[0:2]
    ratio = min(float(target_size[i]) / old_size[i] for i in range(len(old_size)))
    # max(1, ...) keeps extremely elongated images from collapsing to a
    # zero-sized edge, which cv2.resize rejects.
    new_size = tuple(max(1, int(i * ratio)) for i in old_size)
    # cv2.resize takes (width, height); bicubic interpolation.
    img = cv2.resize(im_array, (new_size[1], new_size[0]), interpolation=cv2.INTER_CUBIC)

    pad_w = target_size[1] - new_size[1]
    pad_h = target_size[0] - new_size[0]
    top, bottom = pad_h // 2, pad_h - (pad_h // 2)
    left, right = pad_w // 2, pad_w - (pad_w // 2)
    # Pad with black borders up to the target size.
    return cv2.copyMakeBorder(img, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, None, (0, 0, 0))


def getData(num, paths, return_dict, config):
    """Worker entry point: load one chunk of images into ``return_dict``.

    Args:
        num: Key under which the result is stored in ``return_dict``.
        paths: Image paths to load, in order.
        return_dict: Mapping shared with the parent process.
        config: Dict whose ``"imageResize"`` entry is ``(height, width)``.

    The stored value is a float64 array of the resized images with pixel
    values divided by 255.
    """
    target_size = (config["imageResize"][0], config["imageResize"][1])
    images = [resize_img_keep_ratio(img_path, target_size) for img_path in paths]
    # np.float64 is what the removed np.float alias resolved to.
    Data = np.array(images, dtype=np.float64)
    Data /= 255.0
    return_dict[num] = Data


def readImgMultiProcessing(imagePaths, coreNum, config):
    """Load ``imagePaths`` with up to ``coreNum`` processes.

    Args:
        imagePaths: Sequence of image paths.
        coreNum: Maximum number of worker processes (must be >= 1).
        config: Passed through to ``getData``.

    Returns:
        A float64 array holding the images in the order of ``imagePaths``;
        an empty array when ``imagePaths`` is empty.

    Raises:
        ValueError: if ``coreNum`` is smaller than 1.
        RuntimeError: if any worker exits abnormally or returns no data.
    """
    if coreNum < 1:
        raise ValueError('coreNum must be at least 1, got %r' % (coreNum,))

    # Contiguous chunks of equal length. Only trailing chunks can be empty,
    # so dropping them keeps chunk i aligned with worker i.
    lenPerSt = len(imagePaths) // coreNum + 1
    chunks = [imagePaths[i * lenPerSt:(i + 1) * lenPerSt] for i in range(coreNum)]
    chunks = [chunk for chunk in chunks if len(chunk) > 0]
    if not chunks:
        return np.asarray([], dtype=np.float64)

    # The manager process hosts the dict the workers write their results to;
    # the with-block shuts it down once the results have been copied out.
    with Manager() as manager:
        return_dict = manager.dict()
        jobs = []
        for i, chunk in enumerate(chunks):
            p = Process(target=getData, args=(str(i), chunk, return_dict, config))
            jobs.append(p)
            p.start()
        for proc in jobs:
            proc.join()

        failed = [str(i) for i, proc in enumerate(jobs)
                  if proc.exitcode != 0 or str(i) not in return_dict]
        if failed:
            raise RuntimeError('image loading failed in worker(s): ' + ', '.join(failed))

        # Collect in chunk order so the rows line up with imagePaths.
        parts = [return_dict[str(i)] for i in range(len(chunks))]

    return np.concatenate(parts)


# Example usage:
# if __name__ == '__main__':
#     imgPath = r'E:\新的数据集'
#     imagePaths = sorted(paths.list_images(imgPath))[:100]
#     config = {"epochs": 10, "batch_size": 128, 'useIDG': True,
#               "imageResize": (600, 600), "lr": 1e-3}
#     coreNum = 10
#     data = readImgMultiProcessing(imagePaths, coreNum, config)
#     print(data.shape)

转载地址：http://irzti.baihongyu.com/

你可能感兴趣的文章