From c41ee2b7f125c640f8d32444fe59a0e29e376e5a Mon Sep 17 00:00:00 2001 From: "hannes.kuchelmeister" Date: Sat, 9 May 2020 14:29:39 +0200 Subject: [PATCH] add reference for cross validation --- 30_Thesis/sections/60_evaluation.tex | 2 +- 30_Thesis/thesis.bib | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/30_Thesis/sections/60_evaluation.tex b/30_Thesis/sections/60_evaluation.tex index 157bbc2..bf99c5d 100644 --- a/30_Thesis/sections/60_evaluation.tex +++ b/30_Thesis/sections/60_evaluation.tex @@ -180,7 +180,7 @@ The natural group type for the use case is a heterogeneous group but to widen th \subsection{The Effect of Stored Finished Configurations} -Another important component of the evaluation is the influence of stored finished configurations. When evaluating a subset of stored finished configurations it is important to avoid outliers. This is the reason why a process inspired by \emph{cross validation} \todo{referenz hinzufügen} is used. The configuration database is randomly ordered and sliced into sub-databases of the needed size. As an example, if the evaluated stored data size is 20, a configuration database containing 100 configurations is split into five sub-databases of size 20. Now the evaluation is carried out for each of the sub-databases and finally the average is determined. This avoids the random picking of a subset which either performs much better than most other possible combinations of databases or which performs much worse. This way the data is more aligned to the expected value. +Another important component of the evaluation is the influence of stored finished configurations. When evaluating a subset of stored finished configurations it is important to avoid outliers. This is the reason why a process inspired by \emph{cross validation} \cite{kohaviStudyCrossValidationBootstrap1995} is used. The configuration database is randomly ordered and sliced into sub-databases of the needed size. 
As an example, if the evaluated stored data size is 20, a configuration database containing 100 configurations is split into five sub-databases of size 20. Now the evaluation is carried out for each of the sub-databases and finally the average is determined. This avoids the random picking of a subset which either performs much better than most other possible combinations of databases or which performs much worse. This way the data is more aligned to the expected value. \section{Hypotheses} \label{sec:Evaluation:Hypotheses} diff --git a/30_Thesis/thesis.bib b/30_Thesis/thesis.bib index b316363..ad1ff26 100644 --- a/30_Thesis/thesis.bib +++ b/30_Thesis/thesis.bib @@ -1005,6 +1005,17 @@ procedure.}, langid = {english} } +@inproceedings{kohaviStudyCrossValidationBootstrap1995, + title = {A {{Study}} of {{Cross}}-{{Validation}} and {{Bootstrap}} for {{Accuracy Estimation}} and {{Model Selection}}}, + booktitle = {Proceedings of the 14th {{International Joint Conference}} on {{Artificial Intelligence}}}, + author = {Kohavi, Ron}, + date = {1995}, + pages = {1137--1143}, + publisher = {{Morgan Kaufmann}}, + abstract = {We review accuracy estimation methods and compare the two most common methods: cross-validation and bootstrap. Recent experimental results on artificial data and theoretical results in restricted settings have shown that for selecting a good classifier from a set of classifiers (model selection), ten-fold cross-validation may be better than the more expensive leave-one-out cross-validation. We report on a large-scale experiment---over half a million runs of C4.5 and a Naive-Bayes algorithm---to estimate the effects of different parameters on these algorithms on real-world datasets. For cross-validation, we vary the number of folds and whether the folds are stratified or not; for bootstrap, we vary the number of bootstrap samples. Our results indicate that for real-word datasets similar to ours, the best method to use for model selection is ten-fold stratified cross validation, even if computation power allows using more folds.}, + file = {C\:\\Users\\Hannes.Kuchelmeister\\Zotero\\storage\\GGH5NYBZ\\Kohavi_1995_A Study of Cross-Validation and Bootstrap for Accuracy Estimation and Model.pdf;C\:\\Users\\Hannes.Kuchelmeister\\Zotero\\storage\\M7BT7CCG\\summary.html} +} + @online{kuchelmeister13hannes11BachelorThesis, title = {13hannes11/Bachelor\_thesis\_m.Recommend}, author = {Kuchelmeister, Hannes F.},