Dataset之MNIST:MNIST(手写数字图片识别+ubyte.gz文件)数据集的下载(基于python语言根据爬虫技术自动下载MNIST数据集)

数据集下载的所有代码


代码打包地址:mnist数据集下载的完整代码:https://download.csdn.net/download/qq_41185868/11449755

1、主文件 mnist_download_main.py文件


# 1. Load the data-set.
# The MNIST data-set is roughly 12 MB; it is downloaded automatically
# when it cannot be found in the given directory.
from mnist import MNIST

# The data-set consists of 70,000 images with their labels (the class of
# each image), split into three mutually independent sub-sets. Only the
# training-set and test-set are used in this tutorial.
data = MNIST(data_dir="data/MNIST/")

# Report how many images each sub-set holds.
subset_sizes = (
    ("- Training-set:\t\t{}", data.num_train),
    ("- Validation-set:\t{}", data.num_val),
    ("- Test-set:\t\t{}", data.num_test),
)

print("Size of:")
for template, count in subset_sizes:
    print(template.format(count))



2、mnist.py文件


########################################################################

#

# Downloads the MNIST data-set for recognizing hand-written digits.

#

# Implemented in Python 3.6

#

# Usage:

# 1) Create a new object instance: data = MNIST(data_dir="data/MNIST/")

#    This automatically downloads the files to the given dir.

# 2) Use the training-set as data.x_train, data.y_train and data.y_train_cls

# 3) Get random batches of training data using data.random_batch()

# 4) Use the test-set as data.x_test, data.y_test and data.y_test_cls

#

########################################################################

#

# This file is part of the TensorFlow Tutorials available at:

#

# https://github.com/Hvass-Labs/TensorFlow-Tutorials

#

# Published under the MIT License. See the file LICENSE for details.

#

# Copyright 2016-18 by Magnus Erik Hvass Pedersen

#

########################################################################

import numpy as np

import gzip

import os

from dataset import one_hot_encoded

from download import download

########################################################################

# Internet location of the MNIST data-files. A filename from the list
# below is appended directly to this URL, so it must end with a slash.
base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"

# Gzip data-files holding the images (x) and class-numbers (y)
# of the training-set ...
filename_x_train, filename_y_train = (
    "train-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
)

# ... and of the test-set.
filename_x_test, filename_y_test = (
    "t10k-images-idx3-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz",
)

########################################################################

class MNIST:
    """
    The MNIST data-set for recognizing hand-written digits.

    Creating an instance downloads the data-files if they do not
    already exist in the local data_dir, loads them, and splits the
    training data into a training-set and a validation-set.

    Note: Pixel-values are floats between 0.0 and 1.0.
    """

    # The images are 28 pixels in each dimension.
    img_size = 28

    # The images are stored in one-dimensional arrays of this length.
    img_size_flat = img_size * img_size

    # Tuple with height and width of images used to reshape arrays.
    img_shape = (img_size, img_size)

    # Number of colour channels for the images: 1 channel for gray-scale.
    num_channels = 1

    # Tuple with height, width and depth used to reshape arrays.
    # This is used for reshaping in Keras.
    img_shape_full = (img_size, img_size, num_channels)

    # Number of classes, one class for each of 10 digits.
    num_classes = 10

    def __init__(self, data_dir="data/MNIST/"):
        """
        Load the MNIST data-set. Automatically downloads the files
        if they do not already exist locally.

        After construction the instance exposes, for each of the
        sub-sets train / val / test:
            x_*      float images of shape (num_images, img_size_flat)
            y_*_cls  integer class-numbers
            y_*      one-hot encoded class-numbers

        :param data_dir: Base-directory for downloading files.
        """

        # Copy args to self.
        self.data_dir = data_dir

        # Number of images in each sub-set. The first num_train images
        # of the training-file become the training-set and the remaining
        # ones become the validation-set.
        self.num_train = 55000
        self.num_val = 5000
        self.num_test = 10000

        # Download / load the training-set.
        x_train = self._load_images(filename=filename_x_train)
        y_train_cls = self._load_cls(filename=filename_y_train)

        # Split the training-set into train / validation.
        # Pixel-values are converted from ints between 0 and 255
        # to floats between 0.0 and 1.0.
        self.x_train = x_train[0:self.num_train] / 255.0
        self.x_val = x_train[self.num_train:] / 255.0

        # Download / load the test-set.
        self.x_test = self._load_images(filename=filename_x_test) / 255.0
        y_test_cls = self._load_cls(filename=filename_y_test)

        # Convert the class-numbers from bytes to ints as that is needed
        # some places in TensorFlow. The builtin int is used because the
        # np.int alias was deprecated in NumPy 1.20 and removed in 1.24,
        # where accessing it raises an AttributeError.
        self.y_train_cls = y_train_cls[0:self.num_train].astype(int)
        self.y_val_cls = y_train_cls[self.num_train:].astype(int)
        self.y_test_cls = y_test_cls.astype(int)

        # Convert the integer class-numbers into one-hot encoded arrays.
        self.y_train = one_hot_encoded(class_numbers=self.y_train_cls,
                                       num_classes=self.num_classes)
        self.y_val = one_hot_encoded(class_numbers=self.y_val_cls,
                                     num_classes=self.num_classes)
        self.y_test = one_hot_encoded(class_numbers=self.y_test_cls,
                                      num_classes=self.num_classes)

    def _load_data(self, filename, offset):
        """
        Load the data in the given file. Automatically downloads the file
        if it does not already exist in the data_dir.

        :param filename: Name of the data-file.
        :param offset: Start offset in bytes when reading the data-file.
        :return: The data as a 1-dim numpy array of unsigned bytes.
        """

        # Download the file from the internet if it does not exist locally.
        download(base_url=base_url, filename=filename,
                 download_dir=self.data_dir)

        # Decompress the whole data-file and interpret everything
        # after the first `offset` bytes as unsigned 8-bit values.
        path = os.path.join(self.data_dir, filename)
        with gzip.open(path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=offset)

        return data

    def _load_images(self, filename):
        """
        Load image-data from the given file.
        Automatically downloads the file if it does not exist locally.

        :param filename: Name of the data-file.
        :return: Numpy array with shape (num_images, img_size_flat).
        """

        # Read the data as one long array of bytes,
        # skipping the first 16 bytes of the file.
        data = self._load_data(filename=filename, offset=16)

        # Reshape to 2-dim array with shape (num_images, img_size_flat).
        images_flat = data.reshape(-1, self.img_size_flat)

        return images_flat

    def _load_cls(self, filename):
        """
        Load class-numbers from the given file.
        Automatically downloads the file if it does not exist locally.

        :param filename: Name of the data-file.
        :return: 1-dim numpy array with one class-number per image.
        """

        # The class-numbers start after the first 8 bytes of the file.
        return self._load_data(filename=filename, offset=8)

    def random_batch(self, batch_size=32):
        """
        Create a random batch of training-data.

        The images are sampled with replacement, so the same image
        can occur more than once in a batch.

        :param batch_size: Number of images in the batch.
        :return: 3 numpy arrays (x, y, y_cls)
        """

        # Create a random index into the training-set.
        idx = np.random.randint(low=0, high=self.num_train, size=batch_size)

        # Use the index to lookup random training-data.
        x_batch = self.x_train[idx]
        y_batch = self.y_train[idx]
        y_batch_cls = self.y_train_cls[idx]

        return x_batch, y_batch, y_batch_cls

########################################################################


3、dataset.py文件


########################################################################

#

# Class for creating a data-set consisting of all files in a directory.

#

# Example usage is shown in the file knifey.py and Tutorial #09.

#

# Implemented in Python 3.5

#

########################################################################

#

# This file is part of the TensorFlow Tutorials available at:

#

# https://github.com/Hvass-Labs/TensorFlow-Tutorials

#

# Published under the MIT License. See the file LICENSE for details.

#

# Copyright 2016 by Magnus Erik Hvass Pedersen

#

########################################################################

import numpy as np

import os

import shutil

from cache import cache

########################################################################

def one_hot_encoded(class_numbers, num_classes=None):
    """
    Turn integer class-numbers into One-Hot encoded class-labels.

    Each class-number selects the matching row of an identity matrix,
    so class_number=2 with num_classes=4 gives the float array:
    [0. 0. 1. 0.]

    :param class_numbers:
        Array of integers with class-numbers.
        Assume the integers are from zero to num_classes-1 inclusive.

    :param num_classes:
        Number of classes. If None then use max(class_numbers)+1.

    :return:
        2-dim array of shape: [len(class_numbers), num_classes]
    """

    # Without an explicit count, infer it from the largest class-number.
    # This assumes the lowest class-number is zero.
    if num_classes is None:
        num_classes = np.max(class_numbers) + 1

    # Row k of the identity matrix is the one-hot label for class k.
    identity = np.eye(num_classes, dtype=float)

    return identity[class_numbers]

########################################################################

class DataSet:
    """
    Index of the files in a data-set that is stored on disk with
    one sub-directory per class.

    The object only holds filenames and class-numbers,
    it does not load the contents of the files.
    """

    def __init__(self, in_dir, exts='.jpg'):
        """
        Create a data-set consisting of the filenames in the given directory
        and sub-dirs that match the given filename-extensions.

        For example, the knifey-spoony data-set (see knifey.py) has the
        following dir-structure:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/
        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        This means there are 3 classes called: forky, knifey, and spoony.

        If we set in_dir = "knifey-spoony/" and create a new DataSet-object
        then it will scan through these directories and create a training-set
        and test-set for each of these classes.

        The training-set will contain a list of all the *.jpg filenames
        in the following directories:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/

        The test-set will contain a list of all the *.jpg filenames
        in the following directories:

        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        See the TensorFlow Tutorial #09 for a usage example.

        :param in_dir:
            Root-dir for the files in the data-set.
            This would be 'knifey-spoony/' in the example above.

        :param exts:
            String or tuple of strings with valid filename-extensions.
            Not case-sensitive.

        :return:
            Object instance.
        """

        # Extend the input directory to the full path.
        in_dir = os.path.abspath(in_dir)

        # Input directory.
        self.in_dir = in_dir

        # A single extension given as a string must be wrapped in a tuple
        # before it is iterated. Otherwise '.jpg' would be split into the
        # characters '.', 'j', 'p', 'g' and e.g. 'image.png' would be
        # accepted merely because its name ends with a 'g'.
        if isinstance(exts, str):
            exts = (exts,)

        # Convert all file-extensions to lower-case.
        self.exts = tuple(ext.lower() for ext in exts)

        # Names for the classes.
        self.class_names = []

        # Filenames for all the files in the training-set.
        self.filenames = []

        # Filenames for all the files in the test-set.
        self.filenames_test = []

        # Class-number for each file in the training-set.
        self.class_numbers = []

        # Class-number for each file in the test-set.
        self.class_numbers_test = []

        # Total number of classes in the data-set.
        self.num_classes = 0

        # For all files/dirs in the input directory.
        # Note that the class-numbers follow the order of os.listdir().
        for name in os.listdir(in_dir):
            # Full path for the file / dir.
            current_dir = os.path.join(in_dir, name)

            # Only directories define classes.
            if not os.path.isdir(current_dir):
                continue

            # Add the dir-name to the list of class-names.
            self.class_names.append(name)

            # The class-number for this class.
            class_number = self.num_classes

            # Training-set.
            # Get all the valid filenames in the dir (not sub-dirs)
            # and record the class-number once for each of them.
            filenames = self._get_filenames(current_dir)
            self.filenames.extend(filenames)
            self.class_numbers.extend([class_number] * len(filenames))

            # Test-set.
            # Same as above for the files in the sub-dir named 'test'.
            filenames_test = self._get_filenames(
                os.path.join(current_dir, 'test'))
            self.filenames_test.extend(filenames_test)
            self.class_numbers_test.extend(
                [class_number] * len(filenames_test))

            # Increase the total number of classes in the data-set.
            self.num_classes += 1

    def _get_filenames(self, dir):
        """
        Create and return a list of filenames with matching extensions
        in the given directory.

        :param dir:
            Directory to scan for files. Sub-dirs are not scanned.

        :return:
            List of filenames. Only filenames. Does not include the directory.
            The list is empty if the directory does not exist.
        """

        # A missing directory (e.g. a class without a 'test' sub-dir)
        # simply contributes no files.
        if not os.path.isdir(dir):
            return []

        # Get all the filenames with matching extensions.
        # str.endswith() accepts the whole tuple of extensions at once.
        return [filename for filename in os.listdir(dir)
                if filename.lower().endswith(self.exts)]

    def get_paths(self, test=False):
        """
        Get the full paths for the files in the data-set.

        :param test:
            Boolean. Return the paths for the test-set (True)
            or training-set (False).

        :return:
            Iterator with strings for the path-names.
        """

        if test:
            # Use the filenames and class-numbers for the test-set.
            filenames = self.filenames_test
            class_numbers = self.class_numbers_test

            # Sub-dir for test-set.
            test_dir = "test/"
        else:
            # Use the filenames and class-numbers for the training-set.
            filenames = self.filenames
            class_numbers = self.class_numbers

            # Don't use a sub-dir for test-set.
            # os.path.join() ignores the empty string.
            test_dir = ""

        for filename, cls in zip(filenames, class_numbers):
            # Full path-name for the file.
            yield os.path.join(self.in_dir, self.class_names[cls],
                               test_dir, filename)

    def get_training_set(self):
        """
        Return the list of paths for the files in the training-set,
        and the list of class-numbers as integers,
        and the class-numbers as one-hot encoded arrays.
        """

        return list(self.get_paths()), \
               np.asarray(self.class_numbers), \
               one_hot_encoded(class_numbers=self.class_numbers,
                               num_classes=self.num_classes)

    def get_test_set(self):
        """
        Return the list of paths for the files in the test-set,
        and the list of class-numbers as integers,
        and the class-numbers as one-hot encoded arrays.
        """

        return list(self.get_paths(test=True)), \
               np.asarray(self.class_numbers_test), \
               one_hot_encoded(class_numbers=self.class_numbers_test,
                               num_classes=self.num_classes)

    def copy_files(self, train_dir, test_dir):
        """
        Copy all the files in the training-set to train_dir
        and copy all the files in the test-set to test_dir.

        For example, the normal directory structure for the
        different classes in the training-set is:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/

        Normally the test-set is a sub-dir of the training-set:

        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        But some APIs use another dir-structure for the training-set:

        knifey-spoony/train/forky/
        knifey-spoony/train/knifey/
        knifey-spoony/train/spoony/

        and for the test-set:

        knifey-spoony/test/forky/
        knifey-spoony/test/knifey/
        knifey-spoony/test/spoony/

        :param train_dir: Directory for the training-set e.g. 'knifey-spoony/train/'
        :param test_dir: Directory for the test-set e.g. 'knifey-spoony/test/'
        :return: Nothing.
        """

        # Helper-function for actually copying the files.
        def _copy_files(src_paths, dst_dir, class_numbers):
            # Create a list of dirs for each class, e.g.:
            # ['knifey-spoony/test/forky/',
            #  'knifey-spoony/test/knifey/',
            #  'knifey-spoony/test/spoony/']
            class_dirs = [os.path.join(dst_dir, class_name + "/")
                          for class_name in self.class_names]

            # Make sure each class-directory exists.
            for class_dir in class_dirs:
                os.makedirs(class_dir, exist_ok=True)

            # For all the file-paths and associated class-numbers,
            # copy the file to the destination dir for that class.
            for src, cls in zip(src_paths, class_numbers):
                shutil.copy(src=src, dst=class_dirs[cls])

        # Copy the files for the training-set.
        _copy_files(src_paths=self.get_paths(test=False),
                    dst_dir=train_dir,
                    class_numbers=self.class_numbers)

        print("- Copied training-set to:", train_dir)

        # Copy the files for the test-set.
        _copy_files(src_paths=self.get_paths(test=True),
                    dst_dir=test_dir,
                    class_numbers=self.class_numbers_test)

        print("- Copied test-set to:", test_dir)

########################################################################

def load_cached(cache_path, in_dir):
    """
    Return a DataSet-object for in_dir, going through a cache-file.

    The work is delegated to cache(): if the cache-file already exists
    the object is reloaded from it, otherwise DataSet(in_dir=in_dir)
    is created and saved to the cache-file for next time.

    This is useful if you need to ensure the ordering of the
    filenames is consistent every time you load the data-set,
    for example if you use the DataSet-object in combination
    with Transfer Values saved to another cache-file, see e.g.
    Tutorial #09 for an example of this.

    :param cache_path:
        File-path for the cache-file.

    :param in_dir:
        Root-dir for the files in the data-set.
        This is an argument for the DataSet-init function.

    :return:
        The DataSet-object.
    """

    print("Creating dataset from the files in: " + in_dir)

    # DataSet is passed as the callable and in_dir is forwarded to it
    # as a keyword argument when the object has to be created.
    return cache(cache_path=cache_path, fn=DataSet, in_dir=in_dir)

########################################################################


4、cache.py文件


########################################################################

#

# Cache-wrapper for a function or class.

#

# Save the result of calling a function or creating an object-instance

# to harddisk. This is used to persist the data so it can be reloaded

# very quickly and easily.

#

# Implemented in Python 3.5

#

########################################################################

#

# This file is part of the TensorFlow Tutorials available at:

#

# https://github.com/Hvass-Labs/TensorFlow-Tutorials

#

# Published under the MIT License. See the file LICENSE for details.

#

# Copyright 2016 by Magnus Erik Hvass Pedersen

#

########################################################################

import os

import pickle

import numpy as np

########################################################################

def cache(cache_path, fn, *args, **kwargs):
    """
    Cache-wrapper for a function or class. If the cache-file exists
    then the data is reloaded and returned, otherwise the function
    is called and the result is saved to cache. The fn-argument can
    also be a class instead, in which case an object-instance is
    created and saved to the cache-file.

    The arguments are NOT part of the cache-key: once the cache-file
    exists its contents are returned regardless of args and kwargs.

    :param cache_path:
        File-path for the cache-file.

    :param fn:
        Function or class to be called.

    :param args:
        Arguments to the function or class-init.

    :param kwargs:
        Keyword arguments to the function or class-init.

    :return:
        The result of calling the function or creating the object-instance.
    """

    # If the cache-file exists.
    if os.path.exists(cache_path):
        # Load the cached data from the file.
        # NOTE: unpickling can execute arbitrary code, so only use
        # cache-files that come from a trusted source.
        with open(cache_path, mode='rb') as file:
            obj = pickle.load(file)

        print("- Data loaded from cache-file: " + cache_path)

        return obj

    # The cache-file does not exist.
    # Call the function / class-init with the supplied arguments.
    obj = fn(*args, **kwargs)

    # Save the data to a temporary file next to the cache-file and only
    # move it into place when it has been written completely. Writing
    # directly to cache_path would leave a truncated cache-file behind
    # if pickling fails or is interrupted, and that broken file would
    # then be loaded on every later call.
    tmp_path = "{}.{}.tmp".format(cache_path, os.getpid())

    try:
        with open(tmp_path, mode='wb') as file:
            pickle.dump(obj, file)

        os.replace(tmp_path, cache_path)
    except BaseException:
        # Remove the incomplete file and let the caller see the error.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    print("- Data saved to cache-file: " + cache_path)

    return obj

########################################################################

def convert_numpy2pickle(in_path, out_path):
    """
    Re-save a numpy-file as a pickle-file.

    The first version of the cache-function used numpy for saving the data.
    Instead of re-calculating all the data, you can just convert the
    cache-file using this function.

    :param in_path:
        Input file in numpy-format written using numpy.save().

    :param out_path:
        Output file written as a pickle-file.

    :return:
        Nothing.
    """

    # Read the numpy-file ...
    loaded = np.load(in_path)

    # ... and write what was read back out in pickle-format.
    with open(out_path, mode='wb') as out_file:
        pickle.dump(loaded, out_file)

########################################################################

if __name__ == '__main__':
    # Short demonstration of the cache-wrapper, first with a function
    # and then with a class.

    def expensive_function(a, b):
        # Stand-in for a computation that takes a long time, or whose
        # result must be persisted for some other reason. It is only
        # called when the result is not already in the cache-file.
        return a * b

    class ExpensiveClass:
        # Stand-in for an object that is expensive to create, so we
        # want to cache an object-instance of this class.
        def __init__(self, c, d):
            self.c, self.d = c, d
            self.result = c * d

        def print_result(self):
            rows = (('c =', self.c),
                    ('d =', self.d),
                    ('result = c * d =', self.result))

            for label, value in rows:
                print(label, value)

    # Example 1: cache the result of a function-call.
    print('Computing expensive_function() ...')

    # Loaded from the cache-file if it already exists, otherwise
    # expensive_function(a=123, b=456) is calculated and the result
    # is saved to the cache-file for next time.
    result = cache('cache_expensive_function.pkl',
                   expensive_function, a=123, b=456)

    print('result =', result)

    # Newline.
    print()

    # Example 2: cache an object-instance.
    print('Creating object from ExpensiveClass() ...')

    # Loaded from the cache-file if it already exists, otherwise the
    # object-instance ExpensiveClass(c=123, d=456) is created and
    # saved to the cache-file for the next time.
    obj = cache('cache_ExpensiveClass.pkl',
                ExpensiveClass, c=123, d=456)

    obj.print_result()

########################################################################


5、download.py文件


########################################################################

#

# Functions for downloading and extracting data-files from the internet.

#

# Implemented in Python 3.5

#

########################################################################

#

# This file is part of the TensorFlow Tutorials available at:

#

# https://github.com/Hvass-Labs/TensorFlow-Tutorials

#

# Published under the MIT License. See the file LICENSE for details.

#

# Copyright 2016 by Magnus Erik Hvass Pedersen

#

########################################################################

import sys

import os

import urllib.request

import tarfile

import zipfile

########################################################################

def _print_download_progress(count, block_size, total_size):

   """

   Function used for printing the download progress.

   Used as a call-back function in maybe_download_and_extract().

   """

   # Percentage completion.

   pct_complete = float(count * block_size) / total_size

   # Limit it because rounding errors may cause it to exceed 100%.

   pct_complete = min(1.0, pct_complete)

   # Status-message. Note the \r which means the line should overwrite itself.

   msg = "\r- Download progress: {0:.1%}".format(pct_complete)

   # Print it.

   sys.stdout.write(msg)

   sys.stdout.flush()

########################################################################

def download(base_url, filename, download_dir):
    """
    Download the given file if it does not already exist in the download_dir.

    The data is first written to a temporary '.part' file which is only
    renamed to the final filename when the download has completed, so a
    failed or interrupted download is not mistaken for a finished one
    the next time this function is called.

    :param base_url: The internet URL without the filename.
    :param filename: The filename that will be added to the base_url.
    :param download_dir: Local directory for storing the file.
    :return: Nothing.
    """

    # Path for local file.
    save_path = os.path.join(download_dir, filename)

    # Nothing to do if the file already exists.
    if os.path.exists(save_path):
        return

    # Make sure the download directory exists.
    # An empty string means the current directory.
    if download_dir:
        os.makedirs(download_dir, exist_ok=True)

    print("Downloading", filename, "...")

    # Download the file from the internet.
    url = base_url + filename
    tmp_path = save_path + ".part"

    try:
        urllib.request.urlretrieve(url=url,
                                   filename=tmp_path,
                                   reporthook=_print_download_progress)

        # The download is complete so move the file into place.
        os.replace(tmp_path, save_path)
    except BaseException:
        # Remove the partial file and let the caller see the error.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    print(" Done!")

def maybe_download_and_extract(url, download_dir):
    """
    Download and extract the data if it doesn't already exist.
    Supports zip-files and gzip-compressed tar-balls. A file with
    another filename-extension is downloaded but not extracted.

    :param url:
        Internet URL for the archive-file to download.
        Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

    :param download_dir:
        Directory where the downloaded file is saved.
        Example: "data/CIFAR-10/"

    :return:
        Nothing.
    """

    # Filename for saving the file downloaded from the internet.
    # Use the filename from the URL and add it to the download_dir.
    filename = url.split('/')[-1]
    file_path = os.path.join(download_dir, filename)

    # Check if the file already exists.
    # If it exists then we assume it has also been extracted.
    if os.path.exists(file_path):
        print("Data has apparently already been downloaded and unpacked.")
        return

    # Make sure the download directory exists.
    # An empty string means the current directory.
    if download_dir:
        os.makedirs(download_dir, exist_ok=True)

    # Download to a temporary file and only rename it when the download
    # has completed. The existence of file_path is the marker for
    # "already downloaded", so a partial file must never get that name.
    tmp_path = file_path + ".part"

    try:
        urllib.request.urlretrieve(url=url,
                                   filename=tmp_path,
                                   reporthook=_print_download_progress)

        os.replace(tmp_path, file_path)
    except BaseException:
        # Remove the partial file and let the caller see the error.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    print()
    print("Download finished. Extracting files.")

    # NOTE: extractall() trusts the member-paths stored in the archive,
    # so this should only be used with archives from a trusted source.
    # The archives are opened as context-managers so that the
    # file-handles are closed again after extraction.
    if file_path.endswith(".zip"):
        # Unpack the zip-file.
        with zipfile.ZipFile(file=file_path, mode="r") as archive:
            archive.extractall(download_dir)
    elif file_path.endswith((".tar.gz", ".tgz")):
        # Unpack the tar-ball.
        with tarfile.open(name=file_path, mode="r:gz") as archive:
            archive.extractall(download_dir)

    print("Done.")

########################################################################



上一篇:JVM18_CMS低延迟垃圾收集器、概述、原理、优缺点、参数设置、三色标记、ASTB 和 Incremental Update、记忆集与卡表(一)


下一篇:android:universal:文字and图片and屏幕适配