{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 画像合成を使ったデータセットの作成" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pascal VoC 2012 Standardを使用して、物体検知で使用するデータセットを作成します。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 事前準備" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install pascal_voc_writer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import glob\n", "import random\n", "import os\n", "import shutil\n", "import argparse\n", "import time\n", "import datetime\n", "import math\n", "import numpy\n", "import cv2\n", "from PIL import Image, ImageFont, ImageDraw\n", "from pascal_voc_writer import Writer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "OBJECT_ORIGIN = (0,0)\n", "\n", "def rand(val):\n", " return int(numpy.random.random() * val)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def syntheticDatasetGen(make_num, out_dir, object_path, algorithm, bkg_dir):\n", " print('Generating...')\n", " G = GenerateData()\n", "\n", " class_names = get_class_name()\n", " print(class_names, flush=True)\n", "\n", " G.genBatch(make_num, out_dir, object_path, class_names, algorithm, bkg_dir)\n", " \n", " \n", "# Get Classes\n", "def get_class_name(): \n", " class_names = []\n", " object_files = os.listdir(object_path)\n", " for object_file in object_files:\n", " class_names.append(object_file)\n", " return sorted(class_names)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class GenerateData:\n", " def __init__(self):\n", " # Starting Class\n", " print (\"Starting class\")\n", " \n", " def euler_to_mat(self, yaw, pitch, roll):\n", " # Rotate clockwise about the Y-axis\n", " c, s = math.cos(yaw), math.sin(yaw)\n", " M = numpy.matrix([[ c, 0., s], [ 0., 1., 0.], [ -s, 0., c]])\n", "\n", " # Rotate clockwise about the X-axis\n", " c, s = math.cos(pitch), math.sin(pitch)\n", " M = numpy.matrix([[ 1., 0., 0.], [ 0., c, -s], [ 0., s, c]]) * M\n", "\n", " # Rotate clockwise about the Z-axis\n", " c, s = math.cos(roll), math.sin(roll)\n", " M = numpy.matrix([[ c, -s, 0.], [ s, c, 0.], [ 0., 0., 1.]]) * M\n", "\n", " return M\n", " \n", " def make_affine_transform(self, from_shape, to_shape, \n", " min_scale, max_scale,\n", " scale_variation=1.0,\n", " rotation_variation=1.0,\n", " translation_variation=1.0):\n", " out_of_bounds = 0\n", "\n", " from_size = numpy.array([[from_shape[1], from_shape[0]]]).T\n", " to_size = numpy.array([[to_shape[1], to_shape[0]]]).T\n", "\n", " scale = random.uniform((min_scale + max_scale) * 0.5 -\n", " (max_scale - min_scale) * 0.5 * scale_variation,\n", " (min_scale + max_scale) * 0.5 +\n", " (max_scale - min_scale) * 0.5 * scale_variation)\n", " if scale > max_scale or scale < min_scale:\n", " out_of_bounds = 1\n", " if scale < min_scale:\n", " out_of_bounds = 1\n", " roll = random.uniform(-1.0, 1.0) * rotation_variation\n", " pitch = random.uniform(-0.15, 0.15) * rotation_variation\n", " yaw = random.uniform(-0.15, 0.15) * rotation_variation\n", "\n", " # Compute a bounding box on the skewed input image (`from_shape`).\n", " M = self.euler_to_mat(yaw, pitch, roll)[:2, :2]\n", " h = from_shape[0]\n", " w = from_shape[1]\n", " corners = numpy.matrix([[-w, +w, -w, +w],\n", " [-h, -h, +h, +h]]) * 0.5\n", " skewed_size = numpy.array(numpy.max(M * corners, axis=1) -\n", " numpy.min(M * corners, axis=1))\n", "\n", " # Set the scale as large as possible such that the skewed and scaled shape\n", " # is less than or equal to the desired ratio in either dimension.\n", " scale *= numpy.min(to_size / skewed_size)\n", "\n", " # Set the translation such that the skewed and scaled image falls within\n", " # the output shape's bounds.\n", " trans = (numpy.random.random((2,1)) - 0.5) * translation_variation\n", " trans = ((2.0 * trans) ** 5.0) / 2.0\n", " if numpy.any(trans < -0.5) or numpy.any(trans > 0.5):\n", " out_of_bounds = 1\n", " trans = (to_size - skewed_size * scale) * trans\n", "\n", " center_to = to_size / 2.\n", " center_from = from_size / 2.\n", "\n", " M = self.euler_to_mat(yaw, pitch, roll)[:2, :2]\n", " M *= scale\n", " M = numpy.hstack([M, trans + center_to - M * center_from])\n", "\n", " return M, out_of_bounds\n", " \n", " def createMask(self, shape, radius):\n", " out = numpy.ones(shape)\n", " return out \n", " \n", " def addObject(self, objectPath, class_name):\n", " print(\"Generating images from class: \" + class_name)\n", " image = random.choice(glob.glob(objectPath + '/' + class_name + '/*.png'))\n", " object = Image.open(image)\n", " object_width, object_height = object.size\n", " self.img=numpy.array(Image.new(\"RGBA\", (object_width, object_height), (0,0,0,0)))\n", " img = self.img\n", " scale = float(object.size[0]/object_width)\n", " new_width = int(object.size[0]/scale)\n", " new_height = int(object.size[1]/scale)\n", " self.object_height = new_height\n", " object = object.resize((new_width, new_height))\n", " pil_img = Image.fromarray(img)\n", " pil_img.paste(object, OBJECT_ORIGIN)\n", " pasted = numpy.array(pil_img)\n", " return (image, pasted, object_width, (object_width, object_height))\n", " \n", " def addGauss(self, img, level):\n", " return cv2.blur(img, (level * 2 + 1, level * 2 + 1))\n", " \n", " def addNoiseSingleChannel(self, single):\n", " diff = 255 - single.max();\n", " noise = numpy.random.normal(0, 1+rand(100), single.shape);\n", " noise = (noise - noise.min())/(noise.max()-noise.min())\n", " noise= diff*noise;\n", " noise= noise.astype(numpy.uint8)\n", " dst = single + noise\n", " return dst\n", " \n", " def addNoise(self, img):\n", " img[:,:,0] = self.addNoiseSingleChannel(img[:,:,0]);\n", " img[:,:,1] = self.addNoiseSingleChannel(img[:,:,1]);\n", " img[:,:,2] = self.addNoiseSingleChannel(img[:,:,2]);\n", " return img;\n", " \n", " def tfactor(self,img):\n", " return img\n", " \n", " def generate_bg(self, bgd_folder, object_shape):\n", " found = False\n", " while not found:\n", " fname = random.choice(glob.glob(bgd_folder + '/*.jpg'))\n", " print('selected {} as background'.format(fname))\n", " bg = cv2.imread(fname, 1)\n", " bg = cv2.cvtColor(bg, cv2.COLOR_BGR2RGB)\n", " bg = cv2.resize(bg, (768, 512))\n", " \n", " #random rotation\n", " rotate_M = cv2.getRotationMatrix2D((bg.shape[1]/2,bg.shape[0]/2),random.randint(0,3) * 90,1)\n", " \n", " if (bg.shape[1] >= object_shape[0] and\n", " bg.shape[0] >= object_shape[1]):\n", " found = True\n", " \n", " return bg\n", " \n", " def genBatch(self, batchSize, outputPath, objectPath, class_names, algorithm, bgd_folder):\n", " \n", " if os.path.exists(outputPath):\n", " shutil.rmtree(outputPath)\n", "\n", " if not os.path.exists(outputPath):\n", " os.makedirs(outputPath)\n", " \n", " if not os.path.exists(outputPath + '/OD/VOC2012' + '/JPEGImages'):\n", " os.makedirs(outputPath + '/OD/VOC2012' + '/JPEGImages')\n", " \n", " if not os.path.exists(outputPath + '/OD' + '/Annotations'):\n", " os.makedirs(outputPath + '/OD/VOC2012' + '/Annotations')\n", " \n", " if not os.path.exists(outputPath + '/OD' + '/ImageSets'):\n", " os.makedirs(outputPath + '/OD/VOC2012' + '/ImageSets')\n", " \n", " if not os.path.exists(outputPath + '/OD' + '/ImageSets/Main'):\n", " os.makedirs(outputPath + '/OD/VOC2012' + '/ImageSets/Main')\n", "\n", " if not os.path.exists(outputPath + '/IC'):\n", " os.makedirs(outputPath + '/IC')\n", " \n", " main_val_file = open(outputPath + '/OD/VOC2012/ImageSets/Main/val.txt','a')\n", " main_train_file = open(outputPath + '/OD/VOC2012/ImageSets/Main/train.txt','a')\n", " \n", " gen_log_file = open(outputPath+'/gen.log','w')\n", " \n", " for class_name_idx, class_name in enumerate(class_names):\n", " for i in range(batchSize):\n", "\n", " imagename, generatedData, object_width, object_shape = self.addObject(objectPath, class_name)\n", "\n", " self.bkg = self.generate_bg(bgd_folder, object_shape)\n", "\n", " objectMask = self.createMask(generatedData.shape, 40)\n", " generatedBackground = self.bkg\n", "\n", " M, out_of_bounds = self.make_affine_transform(\n", " from_shape=generatedData.shape,\n", " to_shape=generatedBackground.shape,\n", " min_scale=0.10,\n", " max_scale=0.17,\n", " rotation_variation=3.5,\n", " scale_variation=2.0,\n", " translation_variation=0.98)\n", "\n", " bkgFromArray = Image.fromarray(generatedBackground)\n", " bkgFromArray = bkgFromArray.convert('RGBA')\n", " generatedBackground = numpy.array(bkgFromArray)\n", "\n", " object_topleft = tuple(M.dot(numpy.array((OBJECT_ORIGIN[0],OBJECT_ORIGIN[1]) + (1,))).tolist()[0])\n", " object_topright = tuple(M.dot(numpy.array((OBJECT_ORIGIN[0]+object_width,OBJECT_ORIGIN[1]) + (1,))).tolist()[0])\n", " object_bottomleft = tuple(M.dot(numpy.array((OBJECT_ORIGIN[0],OBJECT_ORIGIN[1]+self.object_height) + (1,))).tolist()[0])\n", " object_bottomright = tuple(M.dot(numpy.array((OBJECT_ORIGIN[0]+object_width,OBJECT_ORIGIN[1]+self.object_height) + (1,))).tolist()[0])\n", "\n", " object_tups = (object_topleft, object_topright, object_bottomleft, object_bottomright)\n", " object_xmin = (min(object_tups, key=lambda item:item[0])[0])\n", " object_xmax = (max(object_tups, key=lambda item:item[0])[0])\n", " object_ymin = (min(object_tups, key=lambda item:item[1])[1])\n", " object_ymax = (max(object_tups, key=lambda item:item[1])[1])\n", "\n", " generatedData = cv2.warpAffine(generatedData, M, (generatedBackground.shape[1], generatedBackground.shape[0]))\n", " objectMask = cv2.warpAffine(objectMask, M, (generatedBackground.shape[1], generatedBackground.shape[0]))\n", "\n", " # light condition\n", " #generatedData = self.tfactor(generatedData)\n", " \n", " # merge images\n", " bg_pil = Image.fromarray(generatedBackground)\n", " object_pil = Image.fromarray(generatedData)\n", " bg_pil.paste(object_pil, (0, 0), object_pil)\n", " out = numpy.array(bg_pil)\n", "\n", " # gauss\n", " out = self.addGauss(out, 0+rand(3))\n", " out = out.astype('float64')\n", " \n", " ### Add Noise\n", " out = self.addNoise(out)\n", " \n", " initial_val = '1'\n", " total_index = (class_name_idx * batchSize) + i\n", "\n", " img_filename = os.path.join(outputPath + '/OD/VOC2012/JPEGImages', initial_val + str(total_index).zfill(5) + '.jpg')\n", " xml_filename = os.path.join(outputPath + '/OD/VOC2012/Annotations', initial_val + str(total_index).zfill(5) + '.xml')\n", "\n", " pil_image = Image.fromarray(out.astype('uint8'))\n", " pil_image.save(img_filename, format='PNG', subsampling=0, quality=100)\n", " \n", " annotator = Writer(img_filename, pil_image.size[0], pil_image.size[1])\n", " annotator.addObject(class_name,object_xmin,object_ymin,object_xmax,object_ymax)\n", " annotator.save(xml_filename)\n", "\n", " if 'IC' in algorithm: \n", " if not os.path.exists(outputPath + '/IC/' + class_name):\n", " os.makedirs(outputPath + '/IC/' + class_name)\n", "\n", " # Crop Image\n", " image_crop = pil_image.crop((object_xmin, object_ymin, object_xmax, object_ymax))\n", " image_crop = image_crop.convert(\"RGB\")\n", " image_crop.save(outputPath+'/IC/' + class_name + \"/\" + initial_val + str(total_index).zfill(5) + '.jpg', format=\"JPEG\")\n", "\n", " if i % (batchSize / 10) == 0:\n", " unformatted_ts = datetime.datetime.fromtimestamp(time.time())\n", " ts = unformatted_ts.strftime('%Y-%m-%d %H:%M:%S')\n", " log_debug_string = '### {} ### Generated Files: {}, {}\\n'.format(ts, img_filename, xml_filename)\n", " gen_log_file.write(log_debug_string)\n", " print(log_debug_string)\n", " \n", " is_train_id = (i < batchSize * 0.8)\n", " if is_train_id:\n", " main_train_file.write(initial_val + str(total_index).zfill(5) + '\\n')\n", " else:\n", " main_val_file.write(initial_val + str(total_index).zfill(5) + '\\n')\n", " \n", " for class_name_file in class_names:\n", " object_val_file = open(outputPath + '/OD/VOC2012/ImageSets/Main/' + class_name_file + '_val.txt','a')\n", " object_train_file = open(outputPath + '/OD/VOC2012/ImageSets/Main/' + class_name_file + '_train.txt','a')\n", "\n", " presence_val = ' -1\\n'\n", "\n", " if class_name == class_name_file: \n", " presence_val = ' 1\\n'\n", " \n", " if is_train_id:\n", " object_train_file.write(initial_val + str(total_index).zfill(5) + presence_val)\n", " else:\n", " object_val_file.write(initial_val + str(total_index).zfill(5) + presence_val)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### パラメータ定義" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Number of pictures per class\n", "make_num = 1000\n", "\n", "## Background Folder\n", "bkg_dir = './backgrounds'\n", "\n", "## Objects Folder\n", "object_path = './bottlecaps'\n", "\n", "## Output directory\n", "out_dir = './dataset'\n", "\n", "## Algorithm Selection. Object Detection = OD\n", "algorithm = ['OD']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ベース画像を作成する\n", "\n", "##### 背景画像\n", "\n", "冷蔵庫内のカメラから撮影した写真を使用します。このときに他の被写体が映らないようにします。\n", "さまざまな照明条件とカメラの露出設定でいくつか写真を撮影します。冷蔵庫が複数段になる場合は、棚毎に写真を撮影します。\n", "backgroundフォルダにある写真はランダムに使用されます。\n", "画像形式は、JPEG(.jpg)にするようにして下さい。\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### オブジェクトの写真\n", "背景のないボトルキャップの写真を使用します。\n", "各オブジェクトクラスの写真を最低20種類以上のバリデーションを使用することを推奨します。各サンプルの写真は様々な角度、照明と眩しさで取得する必要があります。もしオブジェクトが反響の影響を受ける場合は、異なるサンプルを出来る限り多く取得してください。\n", "各クラスには、オブジェクトフォルダ内に独自のフォルダが必要です。クラスフォルダ内の画像がランダムに使用されます。トリミングされたボトルキャップの画像形式は、PNG(.png)である必要があります。\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### データセットの可視化" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 全データセットの生成" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "各オブジェクトクラス毎に1000枚以上のイメージを作成します。20-30分程時間がかかります。" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "from PIL import Image, ImageFont, ImageDraw\n", "syntheticDatasetGen(make_num, out_dir, object_path, algorithm, bkg_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# RecordIOファイル生成" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "データセットの生成後、トレーニングデータとして使用するために、RecordIOファイルに変換する必要があります。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### ツールファイルの生成" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Tools folder\n", "if os.path.exists('tools'):\n", " shutil.rmtree('tools')\n", " \n", "## Get and unzip tools files\n", "!unzip tools.zip\n", "\n", "## RecordIO Folder \n", "if not os.path.exists('RecordIO'):\n", " os.makedirs('RecordIO')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### クラス名取得 (サンプルデータセットを使用した場合、\"coca-cola\" \"fppuccino\" \"pepsi\" \"pure-life\"が作成される" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class_names = get_class_name()\n", "class_names = ', '.join(class_names).strip(\" \").replace(\" \", \"\")\n", "print(class_names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### データセットから、RecordIOファイルへ変換" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%bash -s \"$class_names\" \"$out_dir\"\n", "python tools/prepare_dataset.py --dataset pascal --year 2012 --class-names $1 --set train --target RecordIO/train.lst --root $2/OD --true-negative false\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%bash -s \"$class_names\" \"$out_dir\"\n", "python tools/prepare_dataset.py --dataset pascal --year 2012 --class-names $1 --set val --target RecordIO/val.lst --root $2/OD/ --true-negative false \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "conda_amazonei_mxnet_p36", "language": "python", "name": "conda_amazonei_mxnet_p36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 4 }