Commit 27974add authored by pawel rosikiewicz's avatar pawel rosikiewicz 💬
Browse files

small changes

parent 5cb1f1dd
Pipeline #187623 passed with stage
in 23 seconds
%% Cell type:markdown id: tags:
# SkinAnaliticAI, Skin Cancer Detection with AI Deep Learning
## __Evaluation of Harvard Dataset with different AI classification techniques using FastClassAI pipeline__
Author: __Pawel Rosikiewicz__
prosikiewicz@gmail.com
License: __MIT__
https://opensource.org/licenses/MIT
Copyright (C) 2021.01.30 Pawel Rosikiewicz
%% Cell type:markdown id: tags:
#### standard imports
%% Cell type:code id: tags:
``` python
import os # allow changing, and navigating files and folders,
import sys
import shutil
import re # module to use regular expressions,
import glob # lists names in folders that match Unix shell patterns
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from tensorflow.keras.preprocessing.image import ImageDataGenerator
```
%% Cell type:code id: tags:
``` python
# setup basedir: the notebook lives one level below the project root,
basedir = os.path.dirname(os.getcwd())
os.chdir(basedir)
sys.path.append(basedir)
# set up paths for the project
# NOTE: the old "data/raw" and "data/interim" assignments were dead code -
# they were immediately overwritten by the "data_2" paths below, so they were removed.
PATH_raw = os.path.join(basedir, "data_2/raw")
PATH_interim = os.path.join(basedir, "data_2/interim")
PATH_models = os.path.join(basedir, "models")
PATH_interim_dataset_summary_tables = os.path.join(PATH_interim, "dataset_summary_tables") # created in this notebook,
```
%% Cell type:code id: tags:
``` python
# load functions,
from src.utils.feature_extraction_tools import encode_images
# load configs
from src.configs.project_configs import CLASS_DESCRIPTION # information on each class, including descriptive class name and diagnostic description - used to help with the project
from src.configs.tfhub_configs import TFHUB_MODELS # names of TF hub modules that I preselected for feature extraction, with all relevant info,
from src.configs.dataset_configs import DATASET_CONFIGS # names created for classes, assigned to original ones, and colors assigned to these classes
from src.configs.dataset_configs import CLASS_LABELS_CONFIGS # names created for classes, assigned to original ones, and colors assigned to these classes
from src.configs.dataset_configs import DROPOUT_VALUE # str, special value to indicate samples to remove in class labels
from src.configs.config_functions import DEFINE_DATASETS # function that creates dataset subset collections for one dataset (custom made for that project)
# set project variables
PROJECT_NAME = "SkinAnaliticAI_Harvard_dataset_evaluation" #
DATASET_NAME = "ham10000_workshop" # name used in config files to identify all info on that dataset variant
DATASET_VARIANTS = DATASET_CONFIGS[DATASET_NAME]["labels"] # class labels that will be used, SORT_FILES_WITH must be included
```
%% Cell type:markdown id: tags:
## FEATURE EXTRACTION
%% Cell type:code id: tags:
``` python
# preset values
generator_batch_size = 3000  # no more than 3000 images will be taken, but we expect no more than 2000 in this task.
use_url = "yes"  # "no" -> load modules from local PATH_models; otherwise use the tfhub url carried by the configs

# extract features from images in each dataset variant using one or more tf hub modules,
for dv_i, dataset_variant in enumerate(DATASET_VARIANTS):
    print(f"\n- {dv_i} - Extracting features from: {dataset_variant}")

    # find names of train/valid/test subsets in the dataset folder,
    # BUGFIX: the old pattern "[train|valid|test]*" is a glob CHARACTER CLASS
    # (it matches any single character from "trainvlde|s"), not an alternation,
    # so it could also pick up unrelated files; match each prefix explicitly,
    os.chdir(os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}"))
    subset_names_to_encode = []
    for subset_prefix in ("train", "valid", "test"):
        subset_names_to_encode.extend(glob.glob(f"{subset_prefix}*"))

    # Create lists with info required for feature extraction from images
    # (this step is very useful when many models are used for feature extraction)
    tfmodules = list(TFHUB_MODELS.keys())  # names of tf hub models used
    module_names = [TFHUB_MODELS[x]['module_name'] for x in tfmodules]
    module_file_names = [TFHUB_MODELS[x]['module_url'] for x in tfmodules]
    img_imput_size = [TFHUB_MODELS[x]['input_size'] for x in tfmodules]

    # extract features from images in each subset; each subset is stored as one batch array,
    for i, (one_module_name, one_module_file_name, one_img_input_size) in enumerate(zip(module_names, module_file_names, img_imput_size)):
        # all data subsets found in load_dir are encoded automatically:
        # a logfile is created per dataset, plus a batch_labels csv file and a
        # file with encoded features for each data subset,
        print("\n ................................................")
        print(f" - {dv_i}/{i} module: {one_module_name}")
        print(f" - {dv_i}/{i} filename or url: {one_module_file_name}")
        print(f" - {dv_i}/{i} RGB image size : {one_img_input_size}")
        print(f" - {dv_i}/{i} datset subsets : {subset_names_to_encode}")
        print(f" - Cataloging subsets, then extracting features from all images")
        print(f" - Important: Each subset will be saved as one matrix")
        print("\n")

        # modules may be stored locally, so build a full path to them when requested,
        if use_url == "no":
            one_module_full_path = os.path.join(PATH_models, one_module_file_name)
        else:
            one_module_full_path = one_module_file_name  # here the module url is used directly (no path)

        # extract features
        encode_images(
            # .. dataset name & directories,
            dataset_name=f"{DATASET_NAME}__{dataset_variant}",  # name used when saving encoded files, logfiles etc.,
            subset_names=subset_names_to_encode,  # list of subset folder names inside load_dir,
            load_dir=os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}"),  # folder holding one folder per subset, each with class-named subfolders,
            save_dir=os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}__extracted_beta_features"),  # all new files land here, with a logfile,
            # .. encoding module parameters,
            module_name=one_module_name,  # name used when saving files
            module_location=one_module_full_path,  # full path to a given module, or url,
            img_target_size=one_img_input_size,  # image resolution in pixels,
            generator_batch_size=generator_batch_size,  # must be larger or equal to the size of the largest subset
            generator_shuffle=False,
            # .. other,
            save_files=True,
            verbose=False
        )
```
%%%% Output: stream
- 0 - Extracting features from: cancer_detection_and_classification
................................................
- 0/0 module: MobileNet_v2
- 0/0 filename or url: https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2
- 0/0 RGB image size : (224, 224)
- 0/0 datset subsets : ['test_02', 'valid_01', 'train_01', 'test_01']
- Cataloging subsets, then extracting features from all images
- Important: Each subset will be saved as one matrix
Found 73 images belonging to 7 classes.
Found 145 images belonging to 7 classes.
Found 441 images belonging to 7 classes.
Found 73 images belonging to 7 classes.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
%%%% Output: stream
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
%%%% Output: error
---------------------------------------------------------------------------
UnidentifiedImageError Traceback (most recent call last)
<ipython-input-6-577e7ff79add> in <module>
66 # .. other,
67 save_files = True,
---> 68 verbose = False
69 )
/work/amld-2021-workshop/src/utils/feature_extraction_tools.py in encode_images(dataset_name, load_dir, subset_names, save_dir, module_name, module_location, img_target_size, generator_batch_size, generator_shuffle, save_files, verbose)
261 # img batch encoding, ...............................................
262 #.. Load batch,
--> 263 img_batch, img_labels = next(iterators_dct[setname])
264
265 #.. Extract features from the batch of images,
/opt/conda/lib/python3.7/site-packages/keras_preprocessing/image/iterator.py in __next__(self, *args, **kwargs)
102
103 def __next__(self, *args, **kwargs):
--> 104 return self.next(*args, **kwargs)
105
106 def next(self):
/opt/conda/lib/python3.7/site-packages/keras_preprocessing/image/iterator.py in next(self)
114 # The transformation of images is not under thread lock
115 # so it can be done in parallel
--> 116 return self._get_batches_of_transformed_samples(index_array)
117
118 def _get_batches_of_transformed_samples(self, index_array):
/opt/conda/lib/python3.7/site-packages/keras_preprocessing/image/iterator.py in _get_batches_of_transformed_samples(self, index_array)
228 color_mode=self.color_mode,
229 target_size=self.target_size,
--> 230 interpolation=self.interpolation)
231 x = img_to_array(img, data_format=self.data_format)
232 # Pillow images should be closed after `load_img`,
/opt/conda/lib/python3.7/site-packages/keras_preprocessing/image/utils.py in load_img(path, grayscale, color_mode, target_size, interpolation)
112 'The use of `load_img` requires PIL.')
113 with open(path, 'rb') as f:
--> 114 img = pil_image.open(io.BytesIO(f.read()))
115 if color_mode == 'grayscale':
116 # if image is not already an 8-bit, 16-bit or 32-bit grayscale image
/opt/conda/lib/python3.7/site-packages/PIL/Image.py in open(fp, mode, formats)
2966 warnings.warn(message)
2967 raise UnidentifiedImageError(
-> 2968 "cannot identify image file %r" % (filename if filename else fp)
2969 )
2970
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7faab04b9fb0>
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
import os # allow changing, and navigating files and folders,
import sys
import re # module to use regular expressions,
import glob # lists names in folders that match Unix shell patterns
import random # functions that use and generate random numbers
import numpy as np # support for multi-dimensional arrays and matrices
import pandas as pd # library for data manipulation and analysis
import seaborn as sns # advance plots, for statistics,
import matplotlib.pyplot as plt # for making plots,
import matplotlib as mpl # to get some basif functions, heping with plot mnaking
import tensorflow as tf
import tensorflow_hub as hub
import scipy.stats as stats # library for statistics and technical programming,
import tensorflow.keras as keras
from PIL import Image, ImageDraw
from IPython.display import display
from tensorflow.keras import backend as K # used for housekeeping of tf models,
import matplotlib.patches as mpatches
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from src.utils.image_augmentation import * # some basic tools create for image augmentastion using keras tools, here used for building batch_labels table,
# Function, .................................................................................................................
def encode_images(*,
                  # .. dataset name & directories,
                  dataset_name=None,          # str, used at the start of every file created here,
                  load_dir=None,              # str, full path to the folder holding the image subsets,
                  subset_names=None,          # list of str, subset folder names inside load_dir, if any,
                  save_dir=None,              # str, output directory; load_dir is used when None,
                  # .. encoding module parameters,
                  module_name=None,           # str, short module name used in output file names,
                  module_location,            # str, tfhub url OR path to a locally saved module (required),
                  img_target_size,            # tuple (int, int), RGB image (height, width) in pixels (required),
                  generator_batch_size=1000,  # int, must be >= the size of the largest subset,
                  generator_shuffle=False,    # bool, passed to flow_from_directory,
                  # .. other,
                  save_files=True,
                  verbose=True
                  ):
    """
    Extract high-level features from RGB images with a tf-hub module.

    For every data subset the function:
    - loads all images as ONE batch (shape [img_nr, height, width, 3]),
    - extracts features with the given module and saves them as a .npy file,
    - saves a csv file with info on each image in that batch,
    - appends a row to a logfile describing the files created.

    # Arguments
    . dataset_name          : str, an arbitrary name used at the beginning of all files created,
    . module_name           : str, eg "mobilenet", used as above in each file created here,
    . module_location       : str, either a url to the tfhub module used for feature extraction,
                              or a path to a directory with a locally saved module,
    . load_dir              : str, path to the directory containing subset folders with class-named subfolders,
    . save_dir              : str, path to the directory where all files will be stored,
    . subset_names          : list of str, exact folder names in load_dir (eg train/valid/test),
    . img_target_size       : tuple (int, int), image (height, width) in pixels; RGB ONLY,
                              so the final batch dimensions are [?, height, width, 3],
    . generator_batch_size  : int, keras flow_from_directory batch size; the generator uses only
                              the images available, so set it >= the largest subset; be careful to
                              set generator_shuffle=True when using a subset of images, so that all
                              classes are represented,
    . generator_shuffle     : bool, passed to flow_from_directory,
    . save_files            : bool, if True, encoded batch, logfile and labels are saved in save_dir;
                              results are always also returned as dictionaries keyed by subset name,
    . verbose               : bool,

    # Returns
    . (encoded_batch_dict, batch_labels_dict) : dicts keyed by subset name with the
      feature arrays and the per-image info DataFrames,

    # Saves (when save_files is True)
    . {module}_{dataset}_logfile.csv          : one row per subset,
    . {module}_{dataset}_{subset}_encoded.npy : [images_in_subset, feature_number],
    . {module}_{dataset}_{subset}_labels.csv  : per-image info table,
    """
    # ...........................................................................
    # set defaults ("is None" instead of "== None"; PEP 8),
    if load_dir is None:
        load_dir = os.path.dirname(os.getcwd())
    if save_dir is None:
        save_dir = load_dir
    if module_name is None:
        module_name = "tfmodule"
    if dataset_name is None:
        dataset_name = "encodeddata"

    # create the save directory if it does not exist yet,
    # (narrowed from a bare except so real errors, eg permissions, surface here
    #  instead of at the later os.chdir(save_dir))
    try:
        os.mkdir(save_dir)
        if verbose:
            print(f"Created: {save_dir}")
    except FileExistsError:
        pass

    # ........................................................................
    # Create iterators for each dataset; images are rescaled to [0, 1] (RGB assumed),
    datagen = ImageDataGenerator(rescale=1/255)
    iterators_dct = dict()
    if subset_names is not None:
        # there is more than one subset of the data, eg train, valid and test,
        for setname in subset_names:
            iterators_dct[setname] = datagen.flow_from_directory(
                os.path.join(load_dir, setname),  # each subset is loaded separately,
                # BUGFIX: batch_size was hard-coded to 1 (debugging leftover), which
                # made next() return a single image per subset; generator_batch_size
                # restores the documented "each subset saved as one matrix" behaviour,
                batch_size=generator_batch_size,
                target_size=img_target_size,
                shuffle=generator_shuffle
            )
    else:
        iterators_dct[dataset_name] = datagen.flow_from_directory(
            load_dir,
            batch_size=generator_batch_size,  # uses only the max available pictures in the folder,
            target_size=img_target_size,
            shuffle=generator_shuffle
        )
    if verbose:
        print(f"\n\n{''.join(['.']*80)}\n Creating DataGenerators for: {dataset_name}; {module_name};\n{''.join(['.']*80)}\n")

    # .......................................................................
    # no subsets: reuse the dataset name as the single subset key,
    # important - this must be done here, AFTER the iterators were created!
    if subset_names is None:
        subset_names = [dataset_name]

    # .......................................................................
    # Build the tf graph with the feature-extraction module,
    img_graph = tf.Graph()
    with img_graph.as_default():
        # try a TF1-style hub module first; fall back to a TF2 KerasLayer,
        # (narrowed from a bare except so KeyboardInterrupt etc. still propagate)
        try:
            feature_extractor = hub.Module(module_location, trainable=False)
        except Exception:
            feature_extractor = hub.KerasLayer(module_location)
        # input placeholder; this function works only with RGB (3-channel) images,
        input_imgs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, img_target_size[0], img_target_size[1], 3])
        # node with the extracted high-level features,
        imgs_features = feature_extractor(input_imgs)
        # group the different initializers into one op (tables_initializer is tfhub-specific),
        init_op = tf.group([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        img_graph.finalize()  # good practice: make the graph "read-only"

    # create and initialize the session,
    # following instruction on: https://www.tensorflow.org/hub/common_issues
    sess = tf.compat.v1.Session(graph=img_graph)
    sess.run(init_op)
    if verbose:
        print(f"\n{''.join(['.']*80)}\n TF Graph;")
        print(f"Feature extraction Module, from: {module_location}")

    # .......................................................................
    # Encode every subset as one batch; collect results before saving/returning,
    file_sets = list()
    encoded_batch_dict = dict()
    batch_labels_dict = dict()
    for setname in subset_names:
        # load the whole subset as a single batch,
        img_batch, img_labels = next(iterators_dct[setname])
        # extract features from the batch of images,
        img_batch_features = sess.run(imgs_features, feed_dict={input_imgs: img_batch})
        encoded_batch_dict[setname] = img_batch_features
        # human-friendly info on each image (the helper is used only to build the info df),
        to_forget, img_batch_info = create_augmented_images(external_generator=iterators_dct[setname], augm_img_nr=0)
        batch_labels_dict[setname] = img_batch_info
        if verbose:
            print(f"{''.join(['.']*80)}\n Ecoding imgages in one batch for < {setname} > dataset;")
            print(f"Feature number = {img_batch_features.shape}")
            print(f"label table shape = {img_batch_info.shape}")
        # save features (.npy), labels (.csv), and collect the logfile row,
        if save_files:
            os.chdir(save_dir)
            encoded_img_file_name = f"{module_name}_{dataset_name}_{setname}_encoded.npy"
            np.save(encoded_img_file_name, img_batch_features)
            batch_label_file_name = f"{module_name}_{dataset_name}_{setname}_labels.csv"
            img_batch_info.to_csv(batch_label_file_name, header=True, index=False)
            if verbose:
                print(f"Saved as:\n{encoded_img_file_name} and {batch_label_file_name}")
                print(f"saved in:\n{os.getcwd()}")
            # log entry required for easy loading of the batch and label files,
            file_sets.append({
                "module_name": module_name,
                "datasetname": setname,
                "img_batch_features_filename": encoded_img_file_name,
                "img_batch_info_filename": batch_label_file_name,
                "batch_size": img_batch_info.shape[0],
                "created_dt": pd.to_datetime("now"),
                "module_source": module_location
            })

    # .......................................................................
    # Save the log table for the entire dataset, one row per subset,
    if save_files:
        os.chdir(save_dir)
        summary_table_filename = f"{module_name}_{dataset_name}_logfile.csv"
        summary_table = pd.DataFrame(file_sets)
        summary_table.to_csv(summary_table_filename, header=True, index=False)
        if verbose:
            print(f"{''.join(['.']*80)}\n Creating logfile for < {dataset_name} >;")
            print(f"saved as: {summary_table_filename}")
            print(f"in: {save_dir}")

    # return the batch and labels, just in case,
    return encoded_batch_dict, batch_labels_dict
# preset values
generator_batch_size = 3000  # no more than 3000 images will be taken, but we expect no more than 2000 in this task.
use_url = "yes"  # "no" -> load modules from local PATH_models; otherwise use the tfhub url carried by the configs

# extract features from images in each dataset variant using one or more tf hub modules,
for dv_i, dataset_variant in enumerate(DATASET_VARIANTS):
    print(f"\n- {dv_i} - Extracting features from: {dataset_variant}")

    # find names of train/valid/test subsets in the dataset folder,
    # BUGFIX: "[train|valid|test]*" is a glob CHARACTER CLASS (any single char
    # of "trainvlde|s"), not an alternation; match the prefixes explicitly,
    os.chdir(os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}"))
    subset_names_to_encode = []
    for subset_prefix in ("train", "valid", "test"):
        subset_names_to_encode.extend(glob.glob(f"{subset_prefix}*"))

    # Create lists with info required for feature extraction from images
    # (this step is very useful when many models are used for feature extraction)
    tfmodules = list(TFHUB_MODELS.keys())  # names of tf hub models used
    module_names = [TFHUB_MODELS[x]['module_name'] for x in tfmodules]
    module_file_names = [TFHUB_MODELS[x]['module_url'] for x in tfmodules]
    img_imput_size = [TFHUB_MODELS[x]['input_size'] for x in tfmodules]

    # extract features from images in each subset; each subset is stored as one batch array,
    for i, (one_module_name, one_module_file_name, one_img_input_size) in enumerate(zip(module_names, module_file_names, img_imput_size)):
        # all data subsets found in load_dir are encoded automatically:
        # a logfile is created per dataset, plus a batch_labels csv file and a
        # file with encoded features for each data subset,
        print("\n ................................................")
        print(f" - {dv_i}/{i} module: {one_module_name}")
        print(f" - {dv_i}/{i} filename or url: {one_module_file_name}")
        print(f" - {dv_i}/{i} RGB image size : {one_img_input_size}")
        print(f" - {dv_i}/{i} datset subsets : {subset_names_to_encode}")
        print(f" - Cataloging subsets, then extracting features from all images")
        print(f" - Important: Each subset will be saved as one matrix")
        print("\n")

        # modules may be stored locally, so build a full path to them when requested,
        if use_url == "no":
            one_module_full_path = os.path.join(PATH_models, one_module_file_name)
        else:
            one_module_full_path = one_module_file_name  # here the module url is used directly (no path)

        # extract features
        encode_images(
            # .. dataset name & directories,
            dataset_name=f"{DATASET_NAME}__{dataset_variant}",  # name used when saving encoded files, logfiles etc.,
            subset_names=subset_names_to_encode,  # list of subset folder names inside load_dir,
            load_dir=os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}"),  # folder holding one folder per subset, each with class-named subfolders,
            save_dir=os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}__extracted_beta_features"),  # all new files land here, with a logfile,
            # .. encoding module parameters,
            module_name=one_module_name,  # name used when saving files
            module_location=one_module_full_path,  # full path to a given module, or url,
            img_target_size=one_img_input_size,  # image resolution in pixels,
            generator_batch_size=generator_batch_size,  # must be larger or equal to the size of the largest subset
            generator_shuffle=False,
            # .. other,
            save_files=True,
            verbose=False
        )
```
%%%% Output: stream
- 0 - Extracting features from: cancer_detection_and_classification
................................................
- 0/0 module: MobileNet_v2
- 0/0 filename or url: https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2
- 0/0 RGB image size : (224, 224)
- 0/0 datset subsets : ['test_02', 'valid_01', 'train_01', 'test_01']
- Cataloging subsets, then extracting features from all images
- Important: Each subset will be saved as one matrix
Found 73 images belonging to 7 classes.