# Project-level configuration imports (whitespace restored — the exported text had
# all spaces stripped, so these lines parsed as meaningless attribute expressions
# instead of imports).
from src.configs.project_configs import CLASS_DESCRIPTION  # information on each class, including descriptive class name and diagnostic description - used to help with the project
from src.configs.tfhub_configs import TFHUB_MODELS  # names of TF hub modules preselected for feature extraction, with all relevant info,
from src.configs.dataset_configs import DATASET_CONFIGS  # names created for classes, assigned to original ones, and colors assigned to these classes
from src.configs.dataset_configs import CLASS_LABELS_CONFIGS  # names created for classes, assigned to original ones, and colors assigned to these classes
from src.configs.dataset_configs import DROPOUT_VALUE  # str, special value to indicate samples to remove in class labels
from src.configs.config_functions import DEFINE_DATASETS  # function that creates dataset-subset collections for one dataset (custom made for this project)
one_module_full_path = one_module_file_name  # here I am using the module url, (no path)

# extract features
# (whitespace restored from the mangled export; the intact copy of this call later
# in the notebook confirms the intended text)
encode_images(
    # .. dataset name & directories,
    dataset_name=f"{DATASET_NAME}__{dataset_variant}",  # name used when saving encoded files, logfiles and other things related to encoding,
    subset_names=subset_names_to_encode,  # list, just names of files in the load_dir, if any,
    load_dir=os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}"),  # full path to input data, ie. file folder with either folders with images named after class names, or folders with subset names, and folders named after each class in them,
    save_dir=os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}__extracted_beta_features"),  # all new files will be saved as one batch, with logfile; if None, load_dir will be used,
    # .. encoding module parameters,
    module_name=one_module_name,  # name used when saving files
    module_location=one_module_full_path,  # full path to a given module, or url,
    img_target_size=one_img_input_size,  # image resolution in pixels,
    generator_batch_size=generator_batch_size,  # must be larger than or equal to the size of the largest subset
    generator_shuffle=False,
    # .. other,
    save_files=True,
    verbose=False,
)
```
%%%% Output: stream
- 0 - Extracting features from: cancer_detection_and_classification
................................................
- 0/0 module: MobileNet_v2
- 0/0 filename or url: https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7faab04b9fb0>
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
import os # allow changing, and navigating files and folders,
import sys
import re # module to use regular expressions,
import glob # lists names in folders that match Unix shell patterns
import random # functions that use and generate random numbers
import numpy as np # support for multi-dimensional arrays and matrices
import pandas as pd # library for data manipulation and analysis
import seaborn as sns # advance plots, for statistics,
import matplotlib.pyplot as plt # for making plots,
import matplotlib as mpl # to get some basif functions, heping with plot mnaking
import tensorflow as tf
import tensorflow_hub as hub
import scipy.stats as stats # library for statistics and technical programming,
import tensorflow.keras as keras
from PIL import Image, ImageDraw
from IPython.display import display
from tensorflow.keras import backend as K # used for housekeeping of tf models,
import matplotlib.patches as mpatches
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from src.utils.image_augmentation import * # some basic tools create for image augmentastion using keras tools, here used for building batch_labels table,
dataset_name=None,# dataset name used when saving encoded files, logfiles and other things, related to encoding,
load_dir=None, # full path to input data, ie. file folder with either folders with images names after class names, or folders with subsetnames, and folders names after each class in them,
subset_names=None,# list, ust names of files in the load_dir, if any,
save_dir=None, # all new files, will be saved as one batch, with logfile, if None, load_dir will be used,
# .. encoding module parameters,
module_name=None,
module_location, # full path to a given module,
img_target_size, # image resolution in pixels,
generator_batch_size =1000,
generator_shuffle =False,
# .. other,
save_files=True,
verbose=True
):
"""
Function does the following:
- extracts features from rgb figures,
- saves each batch in npy file format, [img_nr, x_pixel, y_pixel, 3]
- saves csv file with info on each image in that file,
- creates log file with info on files with images that were encoded and
# Arguments
. datase_tname : str, an arbitrary name used at the beginning of all files created
. module_name : str eg: "mobilenet", used as above in each file create with that function,
. module_location : str, either ulr to tfhub module used for feature extraction,
or path to directory with locally saved module,
. load_dir : str, path to directory that contains the folder with folders containgn classes,
. save_dir : str, path to directory whther all files will be stored,
. folders_in_load_dir : list with strings, each string is an exact folder name, that contains images stored in folders with class names,
. img_target_size : tuple, (int, int), img (height, width) size in pixesl, Caution, this functions works only with RGB FILES,
thus, the final, batch dimensions are [?, height, width, 3]
. generator_batch_size : int, value for keras, ImageDataGenerator.flow_from_directory, batch size,
the generator will use only max available images, if less is in the dataset,
or only the first 10000 or other requested number, if more is available,
be carefull to set generator_shuffle = True, while using a subset of images to have all classes
represented,
. generator_shuffle : bool, value for keras, ImageDataGenerator.flow_from_directory,
. ...
. verbose : bool, default==False,
. save_files : bool, if True, encoded batch, logfile, and labels, will be saved ib save_dir,
if False, encoded batch and labels will be returned as dictionaries in list
with folders_in_load_dir as keys, no log file will be created,
# Returns
. log file : pd.dataFrame, with basic info on each dataset encoded, and names used to save files for it,
# saves
. log file : pd.dataFrame, saved in save_dir as .csv
. img_batch_info : pd.dataFrame, saved in save_dir as .csv
. img_batch_features : .npy with [(images found in a given set), feture_number], with encoded features from the given module,
# example - use case,
# ... from,
module_path = os.path.join(basedir,"models/imagenet_mobilenet_v2_100_224_feature_vector_2") # path to local directory,
module_path = 'https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2' # as url
# load the module, by default i set TF 1.x models, and if they dont work, I assumed that you will load TF 2.X modules with hub.load() functions used as below
try:
feature_extractor = hub.Module(module_location, trainable=False) # tf1 modules, tahes a bit of time, to check if there is an error, but it was the fasted I could use with coding,
#print("Signatures: ",feature_extractor.get_signature_names()) # eg: ['default', 'image_feature_vector'] - signatures, can be used to extract subset of NN layers
one_module_full_path = one_module_file_name # here I am using module url, (no path)
# extract features
encode_images(
# .. dastaset name & directories,
dataset_name = f"{DATASET_NAME}__{dataset_variant}",# name used when saving encoded files, logfiles and other things, related to encoding,
subset_names = subset_names_to_encode,# list, ust names of files in the load_dir, if any,
load_dir = os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}"), # full path to input data, ie. file folder with either folders with images names after class names, or folders with subsetnames, and folders names after each class in them,
save_dir = os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}__extracted_beta_features"), # all new files, will be saved as one batch, with logfile, if None, load_dir will be used,
# .. encoding module parameters,
module_name = one_module_name, # name used when saving files
module_location = one_module_full_path, # full path to a given module, or url,
img_target_size = one_img_input_size, # image resolution in pixels,
generator_batch_size = generator_batch_size, # must be larger or equal to in size of the largest subset
generator_shuffle = False,
# .. other,
save_files = True,
verbose = False
)
```
%%%% Output: stream
- 0 - Extracting features from: cancer_detection_and_classification
................................................
- 0/0 module: MobileNet_v2
- 0/0 filename or url: https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2