引言
Quick Draw Dataset 是一个包含 345 个类别、共 5000 万幅绘图的集合,由游戏 Quick, Draw! 的玩家贡献。这些绘图被捕获为带时间戳的矢量,并用元数据标记,包括要求玩家绘制的内容以及玩家所在的国家/地区。您可以在 quickdraw.withgoogle.com/data 上浏览已识别的绘图。
因为我需要对简笔画之类的数据集做分类,但是我手上的数据集太小,所以需要先用大量的数据训练一个大模型,然后再微调。
正文
步骤:先把ndjson转换成json,然后再把json转换成png。
我会先把单个的代码展示出来,最后有完整的代码逻辑。
1、ndjson转json
var fs = require('fs');
var ndjson = require('ndjson'); // npm install ndjson
/**
 * Read an NDJSON file and collect every parsed record into an array.
 *
 * @param {string} fileName - Path of the .ndjson file to read.
 * @param {function} callback - Node-style callback, invoked exactly once:
 *   with (err) if reading or parsing fails, or with (null, drawings) once
 *   the whole file has been parsed.
 */
function parseSimplifiedDrawings(fileName, callback) {
  var drawings = [];
  var finished = false;

  // Make sure the callback fires only once, even if both streams report
  // an error or an error is followed by an "end" event.
  function finish(err) {
    if (finished) return;
    finished = true;
    if (err) {
      callback(err);
    } else {
      callback(null, drawings);
    }
  }

  var fileStream = fs.createReadStream(fileName);
  // pipe() does not forward errors from the source stream, so a missing or
  // unreadable file has to be handled here; otherwise the "error" event is
  // unhandled and crashes the process.
  fileStream.on('error', finish);

  fileStream
    .pipe(ndjson.parse())
    .on('data', function (obj) {
      drawings.push(obj);
    })
    .on('error', finish)
    .on('end', function () {
      finish(null);
    });
}
/**
 * Convert one Quick Draw .ndjson file into a single JSON array file.
 *
 * The input is read from the hard-coded QuickDrawsimplified directory and the
 * result is written as <name>.json into the hard-coded jsons directory; change
 * both paths below to match your machine.
 *
 * @param {string} filename - File name inside the input directory,
 *   e.g. "airplane.ndjson".
 */
function tojson(filename) {
  // "airplane.ndjson" -> ["airplane", "ndjson"]; element 0 names the output file.
  var list = filename.split(".");
  parseSimplifiedDrawings("D:\\my_py\\data\\QuickDrawsimplified\\" + filename, function (err, drawings) {
    if (err) return console.error(err);
    drawings.forEach(function (d) {
      // Do something with the drawing
      console.log(d.key_id, d.countrycode);
    });
    // Log the count; printing the array itself would dump every drawing.
    console.log("# of drawings:", drawings.length);
    // Separate name so the output path does not shadow the `filename` parameter.
    var outputPath = "D:\\my_py\\data\\jsons\\" + list[0] + ".json"; // output location
    fs.writeFileSync(outputPath, JSON.stringify(drawings));
  });
}
要运行上面的js文件,需要先安装nodejs,可以上网查教程。
需要调用tojson函数,传入ndjson文件的文件名,这个函数就会把ndjson转换为json文件。别忘了改保存json的地址。
2、json转png
# Render the first drawings of one category to PNG files.
# `list[0]` is the category name (the file name without its extension).
with open("D:\\my_py\\data\\jsons\\" + list[0] + ".json") as f:
    setting = json.load(f)
# Convert at most 200 drawings, without indexing past the end of a small file.
for j in range(0, min(200, len(setting))):
    # Each stroke is indexed as [x_values, y_values]; every stroke of one
    # drawing is plotted onto the same figure.
    for stroke in setting[j]['drawing']:
        x = stroke[0]
        y = stroke[1]
        pl.plot(x, y, 'k')
    ax = pl.gca()
    # Move the x axis to the top and flip y; without these lines the drawing
    # comes out upside down.
    ax.xaxis.set_ticks_position('top')
    ax.invert_yaxis()
    pl.axis('off')
    pl.savefig("D:\\my_py\\data\\images\\" + list[0] + "\\" + list[0] + "%d.png" % j)  # output location
    pl.close()  # close the figure, otherwise later drawings pile onto it
f为打开的json文件,按照这个流程走,就可以转换成为png文件了。
完整代码
通过读List.txt里面的文件名,拼接成ndjson文件的路径,先转换成json,然后转换成img图片。一共有345类,每类1000张图片。这里我只转换了前200张。大概用了一个多小时。
imageTansform.py是将白底黑字的图片转换成黑底白字的图片,看需求而定。
json_to_imgs.py
import json
from scipy import interpolate
import pylab as pl
import execjs
import pandas as pd
import os
def js_from_file(file_name):
    """
    Load the text of a JavaScript source file.

    :param file_name: path of the file to read
    :return: the entire file content as a string, decoded as UTF-8
    """
    js_file = open(file_name, 'r', encoding='UTF-8')
    try:
        return js_file.read()
    finally:
        js_file.close()
if __name__ == '__main__':
    # Compile the JS helper once; it is the same for every category.
    ndjson_to_json = execjs.compile(js_from_file('ndjson_to_json.js'))
    with open("List.txt", "r") as list_file:
        for line in list_file:
            filename = line.strip('\n')  # drop the trailing newline
            if not filename:
                continue  # ignore blank lines in List.txt
            # "airplane.ndjson" -> "airplane"
            name = filename.split('.')[0]
            try:
                ndjson_to_json.call("tojson", filename)
            except Exception as exc:
                # Best effort, as before, but no longer silent: report the
                # failure and let the existence check below decide whether
                # this category can still be rendered.
                print("tojson failed for %s: %s" % (filename, exc))
            json_path = "D:\\my_py\\data\\jsons\\" + name + ".json"  # absolute path of the json file
            if not os.path.exists(json_path):
                print("skipping %s: %s was not produced" % (filename, json_path))
                continue
            image_dir = "D:\\my_py\\data\\images\\" + name
            # exist_ok lets the script be re-run without crashing on old output.
            os.makedirs(image_dir, exist_ok=True)
            with open(json_path, encoding="utf-8") as json_file:
                setting = json.load(json_file)
            # Convert at most 200 drawings, without indexing past the end.
            for j in range(0, min(200, len(setting))):
                # Each stroke is indexed as [x_values, y_values]; every stroke
                # of one drawing is plotted onto the same figure.
                for stroke in setting[j]['drawing']:
                    x = stroke[0]
                    y = stroke[1]
                    pl.plot(x, y, 'k')
                ax = pl.gca()
                # Move the x axis to the top and flip y; without these lines
                # the drawing comes out upside down.
                ax.xaxis.set_ticks_position('top')
                ax.invert_yaxis()
                pl.axis('off')
                pl.savefig(image_dir + "\\" + name + "%d.png" % j)  # output location
                pl.close()  # close the figure, otherwise later drawings pile onto it
ndjson_to_json.js
var fs = require('fs');
var ndjson = require('ndjson'); // npm install ndjson
/**
 * Read an NDJSON file and collect every parsed record into an array.
 *
 * @param {string} fileName - Path of the .ndjson file to read.
 * @param {function} callback - Node-style callback, invoked exactly once:
 *   with (err) if reading or parsing fails, or with (null, drawings) once
 *   the whole file has been parsed.
 */
function parseSimplifiedDrawings(fileName, callback) {
  var drawings = [];
  var finished = false;

  // Make sure the callback fires only once, even if both streams report
  // an error or an error is followed by an "end" event.
  function finish(err) {
    if (finished) return;
    finished = true;
    if (err) {
      callback(err);
    } else {
      callback(null, drawings);
    }
  }

  var fileStream = fs.createReadStream(fileName);
  // pipe() does not forward errors from the source stream, so a missing or
  // unreadable file has to be handled here; otherwise the "error" event is
  // unhandled and crashes the process.
  fileStream.on('error', finish);

  fileStream
    .pipe(ndjson.parse())
    .on('data', function (obj) {
      drawings.push(obj);
    })
    .on('error', finish)
    .on('end', function () {
      finish(null);
    });
}
/**
 * Convert one Quick Draw .ndjson file into a single JSON array file.
 *
 * The input is read from the hard-coded QuickDrawsimplified directory and the
 * result is written as <name>.json into the hard-coded jsons directory; change
 * both paths below to match your machine.
 *
 * @param {string} filename - File name inside the input directory,
 *   e.g. "airplane.ndjson".
 */
function tojson(filename) {
  // "airplane.ndjson" -> ["airplane", "ndjson"]; element 0 names the output file.
  var list = filename.split(".");
  parseSimplifiedDrawings("D:\\my_py\\data\\QuickDrawsimplified\\" + filename, function (err, drawings) {
    if (err) return console.error(err);
    drawings.forEach(function (d) {
      // Do something with the drawing
      console.log(d.key_id, d.countrycode);
    });
    // Log the count; printing the array itself would dump every drawing.
    console.log("# of drawings:", drawings.length);
    // Separate name so the output path does not shadow the `filename` parameter.
    var outputPath = "D:\\my_py\\data\\jsons\\" + list[0] + ".json"; // output location
    fs.writeFileSync(outputPath, JSON.stringify(drawings));
  });
}
List.txt
aircraft carrier.ndjson
airplane.ndjson
alarm clock.ndjson
ambulance.ndjson
angel.ndjson
animal migration.ndjson
ant.ndjson
anvil.ndjson
apple.ndjson
arm.ndjson
asparagus.ndjson
axe.ndjson
backpack.ndjson
banana.ndjson
bandage.ndjson
barn.ndjson
baseball bat.ndjson
baseball.ndjson
basket.ndjson
basketball.ndjson
bat.ndjson
bathtub.ndjson
beach.ndjson
bear.ndjson
beard.ndjson
bed.ndjson
bee.ndjson
belt.ndjson
bench.ndjson
bicycle.ndjson
binoculars.ndjson
bird.ndjson
birthday cake.ndjson
blackberry.ndjson
blueberry.ndjson
book.ndjson
boomerang.ndjson
bottlecap.ndjson
bowtie.ndjson
bracelet.ndjson
brain.ndjson
bread.ndjson
bridge.ndjson
broccoli.ndjson
broom.ndjson
bucket.ndjson
bulldozer.ndjson
bus.ndjson
bush.ndjson
butterfly.ndjson
cactus.ndjson
cake.ndjson
calculator.ndjson
calendar.ndjson
camel.ndjson
camera.ndjson
camouflage.ndjson
campfire.ndjson
candle.ndjson
cannon.ndjson
canoe.ndjson
car.ndjson
carrot.ndjson
castle.ndjson
cat.ndjson
ceiling fan.ndjson
cell phone.ndjson
cello.ndjson
chair.ndjson
chandelier.ndjson
church.ndjson
circle.ndjson
clarinet.ndjson
clock.ndjson
cloud.ndjson
coffee cup.ndjson
compass.ndjson
computer.ndjson
cookie.ndjson
cooler.ndjson
couch.ndjson
cow.ndjson
crab.ndjson
crayon.ndjson
crocodile.ndjson
crown.ndjson
cruise ship.ndjson
cup.ndjson
diamond.ndjson
dishwasher.ndjson
diving board.ndjson
dog.ndjson
dolphin.ndjson
donut.ndjson
door.ndjson
dragon.ndjson
dresser.ndjson
drill.ndjson
drums.ndjson
duck.ndjson
dumbbell.ndjson
ear.ndjson
elbow.ndjson
elephant.ndjson
envelope.ndjson
eraser.ndjson
eye.ndjson
eyeglasses.ndjson
face.ndjson
fan.ndjson
feather.ndjson
fence.ndjson
finger.ndjson
fire hydrant.ndjson
fireplace.ndjson
firetruck.ndjson
fish.ndjson
flamingo.ndjson
flashlight.ndjson
flip flops.ndjson
floor lamp.ndjson
flower.ndjson
flying saucer.ndjson
foot.ndjson
fork.ndjson
frog.ndjson
frying pan.ndjson
garden hose.ndjson
garden.ndjson
giraffe.ndjson
goatee.ndjson
golf club.ndjson
grapes.ndjson
grass.ndjson
guitar.ndjson
hamburger.ndjson
hammer.ndjson
hand.ndjson
harp.ndjson
hat.ndjson
headphones.ndjson
hedgehog.ndjson
helicopter.ndjson
helmet.ndjson
hexagon.ndjson
hockey puck.ndjson
hockey stick.ndjson
horse.ndjson
hospital.ndjson
hot air balloon.ndjson
hot dog.ndjson
hot tub.ndjson
hourglass.ndjson
house plant.ndjson
house.ndjson
hurricane.ndjson
ice cream.ndjson
jacket.ndjson
jail.ndjson
kangaroo.ndjson
key.ndjson
keyboard.ndjson
knee.ndjson
knife.ndjson
ladder.ndjson
lantern.ndjson
laptop.ndjson
leaf.ndjson
leg.ndjson
light bulb.ndjson
lighter.ndjson
lighthouse.ndjson
lightning.ndjson
line.ndjson
lion.ndjson
lipstick.ndjson
lobster.ndjson
lollipop.ndjson
mailbox.ndjson
map.ndjson
marker.ndjson
matches.ndjson
megaphone.ndjson
mermaid.ndjson
microphone.ndjson
microwave.ndjson
monkey.ndjson
moon.ndjson
mosquito.ndjson
motorbike.ndjson
mountain.ndjson
mouse.ndjson
moustache.ndjson
mouth.ndjson
mug.ndjson
mushroom.ndjson
nail.ndjson
necklace.ndjson
nose.ndjson
ocean.ndjson
octagon.ndjson
octopus.ndjson
onion.ndjson
oven.ndjson
owl.ndjson
paint can.ndjson
paintbrush.ndjson
palm tree.ndjson
panda.ndjson
pants.ndjson
paper clip.ndjson
parachute.ndjson
parrot.ndjson
passport.ndjson
peanut.ndjson
pear.ndjson
peas.ndjson
pencil.ndjson
penguin.ndjson
piano.ndjson
pickup truck.ndjson
picture frame.ndjson
pig.ndjson
pillow.ndjson
pineapple.ndjson
pizza.ndjson
pliers.ndjson
police car.ndjson
pond.ndjson
pool.ndjson
popsicle.ndjson
postcard.ndjson
potato.ndjson
power outlet.ndjson
purse.ndjson
rabbit.ndjson
raccoon.ndjson
radio.ndjson
rain.ndjson
rainbow.ndjson
rake.ndjson
remote control.ndjson
rhinoceros.ndjson
rifle.ndjson
river.ndjson
roller coaster.ndjson
rollerskates.ndjson
sailboat.ndjson
sandwich.ndjson
saw.ndjson
saxophone.ndjson
school bus.ndjson
scissors.ndjson
scorpion.ndjson
screwdriver.ndjson
sea turtle.ndjson
see saw.ndjson
shark.ndjson
sheep.ndjson
shoe.ndjson
shorts.ndjson
shovel.ndjson
sink.ndjson
skateboard.ndjson
skull.ndjson
skyscraper.ndjson
sleeping bag.ndjson
smiley face.ndjson
snail.ndjson
snake.ndjson
snorkel.ndjson
snowflake.ndjson
snowman.ndjson
soccer ball.ndjson
sock.ndjson
speedboat.ndjson
spider.ndjson
spoon.ndjson
spreadsheet.ndjson
square.ndjson
squiggle.ndjson
squirrel.ndjson
stairs.ndjson
star.ndjson
steak.ndjson
stereo.ndjson
stethoscope.ndjson
stitches.ndjson
stop sign.ndjson
stove.ndjson
strawberry.ndjson
streetlight.ndjson
string bean.ndjson
submarine.ndjson
suitcase.ndjson
sun.ndjson
swan.ndjson
sweater.ndjson
swing set.ndjson
sword.ndjson
syringe.ndjson
t-shirt.ndjson
table.ndjson
teapot.ndjson
teddy-bear.ndjson
telephone.ndjson
television.ndjson
tennis racquet.ndjson
tent.ndjson
The Eiffel Tower.ndjson
The Great Wall of China.ndjson
The Mona Lisa.ndjson
tiger.ndjson
toaster.ndjson
toe.ndjson
toilet.ndjson
tooth.ndjson
toothbrush.ndjson
toothpaste.ndjson
tornado.ndjson
tractor.ndjson
traffic light.ndjson
train.ndjson
tree.ndjson
triangle.ndjson
trombone.ndjson
truck.ndjson
trumpet.ndjson
umbrella.ndjson
underwear.ndjson
van.ndjson
vase.ndjson
violin.ndjson
washing machine.ndjson
watermelon.ndjson
waterslide.ndjson
whale.ndjson
wheel.ndjson
windmill.ndjson
wine bottle.ndjson
wine glass.ndjson
wristwatch.ndjson
yoga.ndjson
zebra.ndjson
zigzag.ndjson
imageTansform.py
import os
from PIL import Image
def Convert(str):
    """
    Invert the colours of every image in one category folder.

    Every file in the image20/<str> input folder is colour-inverted (so a
    white background with black strokes becomes a black background with
    white strokes) and saved under the same file name in the
    image20_tra/<str> output folder as an RGBA image.

    :param str: category folder name (shadows the builtin ``str`` inside
        this function; the parameter name is kept for existing callers)
    """
    root = "D://my_py//data//image20//" + str
    target = "D:\\my_py\\data\\image20_tra\\" + str
    # Create the output folder once, including any missing parent folders.
    os.makedirs(target, exist_ok=True)
    for filename in os.listdir(root):
        # The context manager releases the file handle once pixels are loaded.
        with Image.open(root + '/' + filename) as img:
            rgb = img.convert("RGB")
        # Invert the three colour bands in one pass, then return to RGBA with
        # an opaque alpha channel, matching the per-pixel version's output.
        inverted = rgb.point(lambda value: 255 - value).convert("RGBA")
        inverted.save(target + "\\" + filename)
if __name__ == "__main__":
    # NOTE(review): List20.txt is presumably formatted like List.txt, one
    # "<category>.ndjson" entry per line; confirm.
    with open("List20.txt", "r") as f:
        for line in f:
            filename = line.strip('\n')  # drop the trailing newline
            if not filename:
                # A blank line would pass an empty category name to Convert().
                continue
            # "airplane.ndjson" -> "airplane"
            Convert(filename.split('.')[0])
图片数据集
345类,每类200张图片
参考文献:
https://zhuanlan.zhihu.com/p/40903937