From c41ee2b7f125c640f8d32444fe59a0e29e376e5a Mon Sep 17 00:00:00 2001
From: "hannes.kuchelmeister" <hannes.kuchelmeister@cas.de>
Date: Sat, 9 May 2020 14:29:39 +0200
Subject: [PATCH] add reference for cross validation

---
 30_Thesis/sections/60_evaluation.tex |  2 +-
 30_Thesis/thesis.bib                 | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/30_Thesis/sections/60_evaluation.tex b/30_Thesis/sections/60_evaluation.tex
index 157bbc2..bf99c5d 100644
--- a/30_Thesis/sections/60_evaluation.tex
+++ b/30_Thesis/sections/60_evaluation.tex
@@ -180,7 +180,7 @@ The natural group type for the use case is a heterogeneous group but to widen th
 
 \subsection{The Effect of Stored Finished Configurations}
 
-Another important component of the evaluation is the influence of stored finished configurations. When evaluating a subset of stored finished configurations it is important to avoid outliers. This is the reason why a process inspired by \emph{cross validation} \todo{referenz hinzufügen} is used. The configuration database is randomly ordered and sliced into sub-databases of the needed size. As an example, if the evaluated stored data size is 20, a configuration database containing 100 configurations is split into five sub-databases of size 20. Now the evaluation is carried out for each of the sub-databases and finally the average is determined. This avoids the random picking of a subset which either performs much better than most other possible combinations of databases or which performs much worse. This way the data is more aligned to the expected value.
+Another important component of the evaluation is the influence of stored finished configurations. When evaluating a subset of stored finished configurations, it is important to avoid outliers. For this reason, a process inspired by \emph{cross-validation}~\cite{kohaviStudyCrossValidationBootstrap1995} is used. The configuration database is randomly ordered and sliced into sub-databases of the needed size. For example, if the evaluated stored data size is 20, a configuration database containing 100 configurations is split into five sub-databases of size 20. The evaluation is then carried out for each of the sub-databases, and finally the average is determined. This avoids randomly picking a subset that performs much better or much worse than most other possible subsets. This way, the result is closer to the expected value.
 
 \section{Hypotheses}
 \label{sec:Evaluation:Hypotheses}
diff --git a/30_Thesis/thesis.bib b/30_Thesis/thesis.bib
index b316363..ad1ff26 100644
--- a/30_Thesis/thesis.bib
+++ b/30_Thesis/thesis.bib
@@ -1005,6 +1005,17 @@ procedure.},
   langid = {english}
 }
 
+@inproceedings{kohaviStudyCrossValidationBootstrap1995,
+  title = {A {{Study}} of {{Cross}}-{{Validation}} and {{Bootstrap}} for {{Accuracy Estimation}} and {{Model Selection}}},
+  author = {Kohavi, Ron},
+  booktitle = {Proceedings of the 14th {{International Joint Conference}} on {{Artificial Intelligence}} ({{IJCAI}})},
+  date = {1995},
+  pages = {1137--1143},
+  publisher = {{Morgan Kaufmann}},
+  abstract = {We review accuracy estimation methods and compare the two most common methods: cross-validation and bootstrap. Recent experimental results on artificial data and theoretical results in restricted settings have shown that for selecting a good classifier from a set of classifiers (model selection), ten-fold cross-validation may be better than the more expensive leave-one-out cross-validation. We report on a large-scale experiment---over half a million runs of C4.5 and a Naive-Bayes algorithm---to estimate the effects of different parameters on these algorithms on real-world datasets. For cross-validation, we vary the number of folds and whether the folds are stratified or not; for bootstrap, we vary the number of bootstrap samples. Our results indicate that for real-word datasets similar to ours, the best method to use for model selection is ten-fold stratified cross validation, even if computation power allows using more folds.},
+  file = {C\:\\Users\\Hannes.Kuchelmeister\\Zotero\\storage\\GGH5NYBZ\\Kohavi_1995_A Study of Cross-Validation and Bootstrap for Accuracy Estimation and Model.pdf;C\:\\Users\\Hannes.Kuchelmeister\\Zotero\\storage\\M7BT7CCG\\summary.html}
+}
+
 @online{kuchelmeister13hannes11BachelorThesis,
   title = {13hannes11/Bachelor\_thesis\_m.Recommend},
   author = {Kuchelmeister, Hannes F.},