diff --git a/graphics/F1_trust.png b/graphics/F1_trust.png new file mode 100644 index 0000000..ba50473 Binary files /dev/null and b/graphics/F1_trust.png differ diff --git a/graphics/avg_acc_humans.png b/graphics/avg_acc_humans.png new file mode 100644 index 0000000..6b03eac Binary files /dev/null and b/graphics/avg_acc_humans.png differ diff --git a/graphics/graph_trust.png b/graphics/graph_trust.png new file mode 100644 index 0000000..6f79233 Binary files /dev/null and b/graphics/graph_trust.png differ diff --git a/graphics/husky_wolf_expla.png b/graphics/husky_wolf_expla.png new file mode 100644 index 0000000..f3ae0d0 Binary files /dev/null and b/graphics/husky_wolf_expla.png differ diff --git a/graphics/husky_wolf_img.png b/graphics/husky_wolf_img.png new file mode 100644 index 0000000..1b65e27 Binary files /dev/null and b/graphics/husky_wolf_img.png differ diff --git a/graphics/picking_features_human.png b/graphics/picking_features_human.png new file mode 100644 index 0000000..6841ffa Binary files /dev/null and b/graphics/picking_features_human.png differ diff --git a/pres.pdf b/pres.pdf new file mode 100644 index 0000000..fef93e2 Binary files /dev/null and b/pres.pdf differ diff --git a/pres.tex b/pres.tex index 9fdbe1e..a155324 100644 --- a/pres.tex +++ b/pres.tex @@ -104,12 +104,27 @@ \begin{frame} \frametitle{Previous Solutions} \begin{itemize} + % Practitioners consistently overestimate their model’s accuracy [20], propagate feedback loops [23], or fail to notice data leaks \item Relying on accuracy based on validation set + \item Gestalt + \item Modeltracker + \begin{itemize} + \item Help users navigate individual instances. + \item Complementary to LIME in terms of explaining models, since they do not address the problem of explaining individual predictions. + \item The submodular pick procedure of LIME can be incorporated in such tools to aid users in navigating larger datasets.
+ \end{itemize} + \item Recognizing the utility of explanations in assessing trust, many have proposed using interpretable models + \begin{itemize} + \item May generalize poorly, if data can't be explained in few dimensions + \item So interpretability, in these cases, comes at the cost of flexibility, accuracy, or efficiency + \end{itemize} + \end{itemize} \end{frame} \begin{frame} + % It becomes clear the dataset has issues, as there is a spurious correlation between the header information and the class Atheism. It is also clear what the problems are, and the steps that can be taken to fix these issues and train a more trustworthy classifier. \frametitle{A look into two predictions} \includegraphics[scale=0.25]{graphics/christ_vs_atheism.png} \end{frame} @@ -239,14 +254,6 @@ % K-Lasso is the procedure of learning the weights via least squares. Wtf are these weights??? - The features \end{frame} - - \begin{frame} - \frametitle{Text Classification} - \end{frame} - - \begin{frame} - \frametitle{Deep Networks for Images} - \end{frame} \subsection{Explaining Models} \begin{frame} @@ -376,7 +383,11 @@ \begin{frame} \frametitle{Faithfulness} \begin{itemize} - \item Faithfulness of explanations is measured on classifiers that are interpretable, \textbf{LR} and \textbf{DT}. Both are trained s.t. the max no. of features is $10$, so features found by these are the \emph{gold standard} of features, in regards to which are important. + \item Faithfulness of explanations is measured on classifiers that are interpretable, \textbf{LR} and \textbf{DT}. + \begin{itemize} + \item Both are trained s.t. the max no. of features they can use is $10$, so features found by these are the \emph{gold standard} of features, with regard to which features are important. + \end{itemize} + \item For each prediction on the test set, explanations are produced and the fraction of the gold features found is computed.
\end{itemize} \end{frame} @@ -394,12 +405,69 @@ \end{frame} \begin{frame} - \frametitle{Simulated human shenanigans} - % In statistical analysis of binary classification, the F1 score (also F-score or F-measure) is a measure of a test's accuracy. It considers both the precision p and the recall r of the test to compute the score: p is the number of correct positive results divided by the number of all positive results returned by the classifier, and r is the number of correct positive results divided by the number of all relevant samples (all samples that should have been identified as positive). The F1 score is the harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. + \frametitle{Should I trust this prediction?} + % In statistical analysis of binary classification, the F1 score (also F-score or F-measure) is a measure of a test's accuracy. It considers both the precision p and the recall r of the test to compute the score: p is the number of correct positive results divided by the number of all positive results returned by the classifier, and r is the number of correct positive results divided by the number of all relevant samples (all samples that should have been identified as positive). The F1 score is the harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. + + % Seems kind of unfair, that random and greedy are mistrusted by simply having an untrustworthy feature in their explanation, while LIME and parzen just have to not change, when these untrustworthy features are removed. + \includegraphics[scale=0.4]{graphics/F1_trust.png} \end{frame} + \begin{frame} + \frametitle{Can I trust this model?} + \begin{itemize} + \item Evaluate if explanations can be used for model selection + \item They add 10 artificially ``noisy'' features s.t.
+ \begin{itemize} + \item Each artificial feature appears in 10\% of the examples in one class, and 20\% of the examples in the other in the training/validation data. + \item While on the test instances, each artificial feature appears in 10\% of the examples in each class. + \end{itemize} + \item This results in models that use both genuinely informative features and ones that introduce spurious correlations. + \item Pairs of competing classifiers are computed by repeatedly training pairs of random forests with 30 trees until their validation accuracy is within 0.1\% of each other, but their test accuracy differs by at least 5\%. + \end{itemize} + \end{frame} + + \begin{frame} + \frametitle{Can I trust this model?} + % They evaluate whether the explanations can be used for model selection, simulating the case where a human has to decide between two competing models with similar accuracy on validation data. + % Accomplished by "marking" the artificial features found within the B instances seen, as untrustworthy. We then evaluate how many total predictions in the validation set should be trusted (as in the previous section, treating only marked features as untrustworthy). + % SP-parzen and RP-parzen are omitted from the figure since they did not produce useful explanations, performing only slightly better than random. Is this ok? + \includegraphics[scale=0.4]{graphics/graph_trust.png} + \end{frame} + + \begin{frame} + \frametitle{Can humans pick the best classifier?} + \includegraphics[scale=0.35]{graphics/avg_acc_humans.png} + \end{frame} + \begin{frame} \frametitle{Can non-experts improve a classifier?} + \includegraphics[scale=0.4]{graphics/picking_features_human.png} + \end{frame} + + \begin{frame} + \frametitle{Can we learn something from the explanations?} + % Hand picked images to create the correlation between wolf and snow, s.t.
the classifier mispredicts whenever a husky is in snow or a wolf is without snow + \centering + \includegraphics[scale=0.2]{graphics/husky_wolf_img.png} + \end{frame} + + \begin{frame} + \frametitle{Can we learn something from the explanations?} + \begin{itemize} + \item Present 10 predictions without explanations % Such as the previous image (a) + \begin{itemize} + \item 2 are mispredictions with a husky in snow and a wolf without snow, the rest are correct + \end{itemize} + \item Ask three questions: + \begin{enumerate} + \item Do you trust this algorithm to generalize? + \item Why? + \item How do you think the algorithm distinguishes? + \end{enumerate} + \item Results shown in table, before and after having seen the explanations. + \end{itemize} + \centering + \includegraphics[scale=0.3]{graphics/husky_wolf_expla.png} \end{frame} %\subsection{Human Subjects} @@ -407,10 +475,26 @@ \begin{frame} \frametitle{Conclusion} + \begin{itemize} + \item They argue that trust is crucial for effective human interaction with machine learning systems + \item Explaining individual predictions is important in assessing trust + \item They proposed LIME, a modular and extensible approach to faithfully explain the predictions of any model in an interpretable manner + \item They introduced SP-LIME, a method to select representative and non-redundant predictions, providing a global view of the model to users. + \item Experiments demonstrated that explanations are useful for a variety of models in trust-related tasks in the text and image domains + \end{itemize} \end{frame} \begin{frame} \frametitle{Future work} + \begin{itemize} + \item They use only sparse linear models as explanations, but their framework supports the exploration of a variety of explanation families, such as DTs.
+ \begin{itemize} + \item This estimate of faithfulness can also be used for selecting an appropriate family of explanations from a set of multiple interpretable model classes, thus adapting to the given dataset and the classifier. + \end{itemize} + \item One issue that they do not mention in this work is how to perform the pick step for images. + \item They would like to investigate potential uses in speech, video, and medical domains, as well as recommendation systems. + \item They would like to explore theoretical properties (such as the appropriate number of samples) and computational optimizations (such as using parallelization and GPU processing). + \end{itemize} \end{frame} \section{Recap} \begin{frame}