- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import os
- import os.path as osp
- import glob
- import numpy as np
- import tqdm
- import paddlex as pdx
- import paddlex.utils.logging as logging
- from paddlex.interpret.as_data_reader.readers import read_image
- from . import lime_base
- from ._session_preparation import compute_features_for_kmeans, gen_user_home
- def load_kmeans_model(fname):
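-     """Load a pickled KMeans model from the file `fname`."""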
- import pickle
- with open(fname, 'rb') as f:
- kmeans_model = pickle.load(f)
- return kmeans_model
- def combine_normlime_and_lime(lime_weights, g_weights):
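-     """Combine LIME and NormLIME results: for each predicted label, multiply
-     every segment's LIME weight by its NormLIME weight, then sort the
-     segments by the magnitude of the combined weight."""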
- pred_labels = lime_weights.keys()
- combined_weights = {y: [] for y in pred_labels}
- for y in pred_labels:
-         normalized_lime_weights_y = lime_weights[y]
-         lime_weights_dict = {
-             tuple_w[0]: tuple_w[1]
-             for tuple_w in normalized_lime_weights_y
-         }
-         normalized_g_weight_y = g_weights[y]
-         normlime_weights_dict = {
-             tuple_w[0]: tuple_w[1]
-             for tuple_w in normalized_g_weight_y
-         }
- combined_weights[y] = [
- (seg_k, lime_weights_dict[seg_k] * normlime_weights_dict[seg_k])
- for seg_k in lime_weights_dict.keys()
- ]
- combined_weights[y] = sorted(
- combined_weights[y], key=lambda x: np.abs(x[1]), reverse=True)
- return combined_weights
- def avg_using_superpixels(features, segments):
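-     """Return one feature vector per superpixel: the mean of the feature
-     vectors of all pixels belonging to that superpixel."""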
- one_list = np.zeros((len(np.unique(segments)), features.shape[2]))
- for x in np.unique(segments):
- one_list[x] = np.mean(features[segments == x], axis=0)
- return one_list
- def centroid_using_superpixels(features, segments):
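-     """Return one feature vector per superpixel: the feature at the rounded
-     centroid of the region. `segments + 1` is passed to regionprops because
-     it ignores the background label 0."""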
- from skimage.measure import regionprops
- regions = regionprops(segments + 1)
- one_list = np.zeros((len(np.unique(segments)), features.shape[2]))
- for i, r in enumerate(regions):
- one_list[i] = features[int(r.centroid[0] + 0.5), int(r.centroid[1] +
- 0.5), :]
- return one_list
- def get_feature_for_kmeans(feature_map, segments):
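-     """Build the per-superpixel feature used by the KMeans model: the
-     centroid feature and the average feature are concatenated and
-     L2-normalized row-wise."""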
- from sklearn.preprocessing import normalize
- centroid_feature = centroid_using_superpixels(feature_map, segments)
- avg_feature = avg_using_superpixels(feature_map, segments)
- x = np.concatenate((centroid_feature, avg_feature), axis=-1)
- x = normalize(x)
- return x
- def precompute_normlime_weights(list_data_,
- predict_fn,
- num_samples=3000,
- batch_size=50,
- save_dir='./tmp'):
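-     """Precompute LIME weights for every sample in `list_data_`, then
-     aggregate them into NormLIME weights. Returns the path of the saved
-     NormLIME weight file."""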
- # save lime weights and kmeans cluster labels
- precompute_lime_weights(list_data_, predict_fn, num_samples, batch_size,
- save_dir)
- # load precomputed results, compute normlime weights and save.
- fname_list = glob.glob(
- os.path.join(save_dir, 'lime_weights_s{}*.npy'.format(num_samples)))
- return compute_normlime_weights(fname_list, save_dir, num_samples)
- def save_one_lime_predict_and_kmean_labels(lime_all_weights, image_pred_labels,
- cluster_labels, save_path):
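-     """Save, for one image, the LIME weights of its predicted labels and the
-     KMeans cluster label of each superpixel."""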
- lime_weights = {}
- for label in image_pred_labels:
- lime_weights[label] = lime_all_weights[label]
- for_normlime_weights = {
- 'lime_weights':
- lime_weights, # a dict: class_label: (seg_label, weight)
- 'cluster': cluster_labels # a list with segments as indices.
- }
- np.save(save_path, for_normlime_weights)
- def precompute_lime_weights(list_data_, predict_fn, num_samples, batch_size,
- save_dir):
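-     """Run the LIME interpreter on every sample in `list_data_` and save, per
-     image, its LIME weights together with the KMeans cluster labels of its
-     superpixels."""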
- root_path = gen_user_home()
- root_path = osp.join(root_path, '.paddlex')
- h_pre_models = osp.join(root_path, "pre_models")
- if not osp.exists(h_pre_models):
- if not osp.exists(root_path):
- os.makedirs(root_path)
- url = "https://bj.bcebos.com/paddlex/interpret/pre_models.tar.gz"
- pdx.utils.download_and_decompress(url, path=root_path)
- h_pre_models_kmeans = osp.join(h_pre_models, "kmeans_model.pkl")
- kmeans_model = load_kmeans_model(h_pre_models_kmeans)
- for data_index, each_data_ in enumerate(list_data_):
- if isinstance(each_data_, str):
- save_path = "lime_weights_s{}_{}.npy".format(
- num_samples, each_data_.split('/')[-1].split('.')[0])
- save_path = os.path.join(save_dir, save_path)
- else:
- save_path = "lime_weights_s{}_{}.npy".format(num_samples,
- data_index)
- save_path = os.path.join(save_dir, save_path)
- if os.path.exists(save_path):
-             logging.info(
-                 save_path + ' already exists, skipping.', use_color=True)
- continue
-         img_file_name = each_data_ if isinstance(each_data_,
-                                                  str) else str(data_index)
-         logging.info(
-             'processing ' + img_file_name + ' [{}/{}]'.format(
-                 data_index + 1, len(list_data_)),
-             use_color=True)
- image_show = read_image(each_data_)
- result = predict_fn(image_show)
- result = result[0] # only one image here.
-         if abs(np.sum(result) - 1.0) > 1e-4:
-             # not yet a probability distribution; apply a numerically stable
-             # softmax.
-             exp_result = np.exp(result - np.max(result))
-             probability = exp_result / np.sum(exp_result)
-         else:
-             probability = result
- pred_label = np.argsort(probability)[::-1]
-         # keep at most the top five labels whose probability exceeds the
-         # threshold, and always keep at least the top-1 label.
-         threshold = 0.05
-         top_k = 0
-         for label in pred_label:
-             if probability[label] < threshold or top_k == 5:
-                 break
-             top_k += 1
-         if top_k == 0:
-             top_k = 1
- pred_label = pred_label[:top_k]
- algo = lime_base.LimeImageInterpreter()
- interpreter = algo.interpret_instance(
- image_show[0],
- predict_fn,
- pred_label,
- 0,
- num_samples=num_samples,
- batch_size=batch_size)
- X = get_feature_for_kmeans(
- compute_features_for_kmeans(image_show).transpose((1, 2, 0)),
- interpreter.segments)
- try:
- cluster_labels = kmeans_model.predict(X)
- except AttributeError:
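-             # the pickled model may come from an older scikit-learn without
-             # predict(); fall back to a nearest-centroid assignment.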
- from sklearn.metrics import pairwise_distances_argmin_min
- cluster_labels, _ = pairwise_distances_argmin_min(
- X, kmeans_model.cluster_centers_)
- save_one_lime_predict_and_kmean_labels(
- interpreter.local_weights, pred_label, cluster_labels, save_path)
- def compute_normlime_weights(a_list_lime_fnames, save_dir, lime_num_samples):
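-     """Aggregate the precomputed per-image LIME weights into NormLIME
-     weights: for each class, the squared LIME weight of every segment
-     (divided by the per-image L1 norm of the weights) is accumulated on the
-     segment's KMeans cluster and then averaged over all occurrences.
-     Returns the path of the saved weight file."""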
- normlime_weights_all_labels = {}
- for f in a_list_lime_fnames:
-         try:
-             lime_weights_and_cluster = np.load(f, allow_pickle=True).item()
-             lime_weights = lime_weights_and_cluster['lime_weights']
-             cluster = lime_weights_and_cluster['cluster']
-         except Exception:
-             logging.info('When loading precomputed LIME result, skipping ' +
-                          str(f))
-             continue
-         logging.info('Loading precomputed LIME result, ' + str(f))
- pred_labels = lime_weights.keys()
- for y in pred_labels:
- normlime_weights = normlime_weights_all_labels.get(y, {})
- w_f_y = [abs(w[1]) for w in lime_weights[y]]
- w_f_y_l1norm = sum(w_f_y)
- for w in lime_weights[y]:
- seg_label = w[0]
- weight = w[1] * w[1] / w_f_y_l1norm
- a = normlime_weights.get(cluster[seg_label], [])
- a.append(weight)
- normlime_weights[cluster[seg_label]] = a
- normlime_weights_all_labels[y] = normlime_weights
-     # average the accumulated weights of each cluster to get the NormLIME
-     # weight of that cluster.
- for y in normlime_weights_all_labels:
- normlime_weights = normlime_weights_all_labels.get(y, {})
- for k in normlime_weights:
- normlime_weights[k] = sum(normlime_weights[k]) / len(
- normlime_weights[k])
-     # sanity check: warn if some classes got no NormLIME result.
-     if len(normlime_weights_all_labels.keys()) < max(
-             normlime_weights_all_labels.keys()) + 1:
-         logging.info(
-             "\n" + \
-             "Warning: !!! \n" + \
-             "There are at least {} classes, ".format(max(normlime_weights_all_labels.keys()) + 1) + \
-             "but NormLIME has results for only {} classes. \n".format(len(normlime_weights_all_labels.keys())) + \
-             "This may cause unstable results in the later computation, " + \
-             "which can be improved by computing more test samples." + \
-             "\n"
-         )
- n = 0
- f_out = 'normlime_weights_s{}_samples_{}-{}.npy'.format(
- lime_num_samples, len(a_list_lime_fnames), n)
-     while os.path.exists(os.path.join(save_dir, f_out)):
-         n += 1
-         f_out = 'normlime_weights_s{}_samples_{}-{}.npy'.format(
-             lime_num_samples, len(a_list_lime_fnames), n)
- np.save(os.path.join(save_dir, f_out), normlime_weights_all_labels)
- return os.path.join(save_dir, f_out)
- def precompute_global_classifier(dataset,
- predict_fn,
- save_path,
- batch_size=50,
- max_num_samples=1000):
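-     """Train a logistic-regression "global classifier" that predicts the
-     model's label from a bag-of-clusters encoding of each test image, and
-     convert its coefficients into per-class NormLIME weights saved at
-     `save_path`."""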
- from sklearn.linear_model import LogisticRegression
- root_path = gen_user_home()
- root_path = osp.join(root_path, '.paddlex')
- h_pre_models = osp.join(root_path, "pre_models")
- if not osp.exists(h_pre_models):
- if not osp.exists(root_path):
- os.makedirs(root_path)
- url = "https://bj.bcebos.com/paddlex/interpret/pre_models.tar.gz"
- pdx.utils.download_and_decompress(url, path=root_path)
- h_pre_models_kmeans = osp.join(h_pre_models, "kmeans_model.pkl")
- kmeans_model = load_kmeans_model(h_pre_models_kmeans)
-     image_list = [item[0] for item in dataset.file_list]
- x_data = []
- y_labels = []
- num_features = len(kmeans_model.cluster_centers_)
- logging.info(
- "Initialization for NormLIME: Computing each sample in the test list.",
- use_color=True)
- for each_data_ in tqdm.tqdm(image_list):
-         x_data_i = np.zeros(num_features)
- image_show = read_image(each_data_)
- result = predict_fn(image_show)
- result = result[0] # only one image here.
-         feature_map = compute_features_for_kmeans(image_show).transpose(
-             (1, 2, 0))
-         # a fixed 10x10 grid is used as the segmentation here, instead of a
-         # superpixel algorithm such as quickshift(image_show[0], sigma=1).
-         segments = np.zeros((image_show.shape[1], image_show.shape[2]),
-                             np.int32)
-         num_blocks = 10
-         height_per_i = segments.shape[0] // num_blocks + 1
-         width_per_i = segments.shape[1] // num_blocks + 1
-         for i in range(segments.shape[0]):
-             for j in range(segments.shape[1]):
-                 segments[i, j] = (i // height_per_i * num_blocks +
-                                   j // width_per_i)
-         X = get_feature_for_kmeans(feature_map, segments)
- try:
- cluster_labels = kmeans_model.predict(X)
- except AttributeError:
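-             # same fallback as above for KMeans models without predict().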
- from sklearn.metrics import pairwise_distances_argmin_min
- cluster_labels, _ = pairwise_distances_argmin_min(
- X, kmeans_model.cluster_centers_)
-         # bag-of-clusters encoding: mark every cluster present in the image.
-         for cluster_label in cluster_labels:
-             x_data_i[cluster_label] = 1
- # x_data_i /= len(cluster_labels)
- pred_y_i = np.argmax(result)
- y_labels.append(pred_y_i)
- x_data.append(x_data_i)
-     if len(np.unique(y_labels)) < 2:
-         logging.info("Warning: The test samples in the dataset are limited.\n \
-             NormLIME may have no effect on the results.\n \
-             Try to add more test samples, or see the results of LIME.")
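-         # a classifier cannot be trained on a single class; fall back to
-         # uniform weights for every class.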
- num_classes = np.max(np.unique(y_labels)) + 1
- normlime_weights_all_labels = {}
- for class_index in range(num_classes):
- w = np.ones((num_features)) / num_features
- normlime_weights_all_labels[class_index] = {
- i: wi
- for i, wi in enumerate(w)
- }
- logging.info("Saving the computed normlime_weights in {}".format(
- save_path))
- np.save(save_path, normlime_weights_all_labels)
- return save_path
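-     # fit the global classifier: predict the model's label from the
-     # bag-of-clusters feature of each image.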
- clf = LogisticRegression(multi_class='multinomial', max_iter=1000)
- clf.fit(x_data, y_labels)
- num_classes = np.max(np.unique(y_labels)) + 1
- normlime_weights_all_labels = {}
-     if len(y_labels) / len(np.unique(y_labels)) < 3:
-         logging.info("Warning: The test samples in the dataset are limited.\n \
-             NormLIME may have no effect on the results.\n \
-             Try to add more test samples, or see the results of LIME.")
- if len(np.unique(y_labels)) == 2:
- # binary: clf.coef_ has shape of [1, num_features]
- for class_index in range(num_classes):
- if class_index not in clf.classes_:
- w = np.ones((num_features)) / num_features
- normlime_weights_all_labels[class_index] = {
- i: wi
- for i, wi in enumerate(w)
- }
- continue
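-             # in the binary case clf.coef_[0] points towards clf.classes_[1],
-             # so flip its sign for the other class.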
- if clf.classes_[0] == class_index:
- w = -clf.coef_[0]
- else:
- w = clf.coef_[0]
-             # temperature-scaled softmax turns the coefficients into
-             # normalized, positive weights.
-             w = w - np.max(w)
-             exp_w = np.exp(w * 10)
-             w = exp_w / np.sum(exp_w)
- normlime_weights_all_labels[class_index] = {
- i: wi
- for i, wi in enumerate(w)
- }
- else:
- # clf.coef_ has shape of [len(np.unique(y_labels)), num_features]
- for class_index in range(num_classes):
- if class_index not in clf.classes_:
- w = np.ones((num_features)) / num_features
- normlime_weights_all_labels[class_index] = {
- i: wi
- for i, wi in enumerate(w)
- }
- continue
- coef_class_index = np.where(clf.classes_ == class_index)[0][0]
- w = clf.coef_[coef_class_index]
-             # temperature-scaled softmax turns the coefficients into
-             # normalized, positive weights.
-             w = w - np.max(w)
-             exp_w = np.exp(w * 10)
-             w = exp_w / np.sum(exp_w)
- normlime_weights_all_labels[class_index] = {
- i: wi
- for i, wi in enumerate(w)
- }
- logging.info("Saving the computed normlime_weights in {}".format(
- save_path))
- np.save(save_path, normlime_weights_all_labels)
- return save_path