xgbTree caret matrix or not?











up vote
0
down vote

favorite












I am running for example the following code:



v.ctrl <- trainControl(method = "repeatedcv", repeats = 1,number = 3, 
summaryFunction = twoClassSummary,
classProbs = TRUE,
allowParallel=T)

xgb.grid <- expand.grid(nrounds = 10000,
eta = c(0.01,0.05,0.1),
max_depth = c(2,4,6,8,10,14))
set.seed(45)
xgb_tune <-train(target~.,
data = train,
method = "xgbTree",
trControl = cv.ctrl,
tuneGrid = xgb.grid,
verbose = TRUE,
metric = "LogLoss",
nthread = 3)


The error is simple:




Error in train(target ~ ., data = train, method = "xgbTree", trControl = cv.ctrl, :
unused arguments (data = train, method = "xgbTree", trControl = cv.ctrl, tuneGrid = xgb.grid, verbose = T, metric = "LogLoss", nthread = 3)




My dataset



structure(list(feature19 = c(0.58776, 0.40764, 0.4708, 0.67577, 0.41681, 0.5291, 0.33197, 0.24138, 0.49776, 0.58293), feature6 = c(0.48424, 0.48828, 0.58975, 0.33185, 0.6917, 0.53813, 0.76235, 0.7036,     0.33871, 0.51928), feature10 = c(0.61347, 0.65801, 0.69926, 0.23311,     0.8134, 0.55321, 0.72926, 0.663, 0.49206, 0.55531), feature20 = c(0.39615,     0.49085, 0.50274, 0.6038, 0.37487, 0.53582, 0.62004, 0.63819,     0.37858, 0.40478), feature7 = c(0.55901, 0.38715, 0.50705, 0.76004,     0.3207, 0.54697, 0.31014, 0.21932, 0.4831, 0.52253), feature4 = c(0.5379,     0.52526, 0.44264, 0.28974, 0.65142, 0.41382, 0.44205, 0.47272,     0.6303, 0.56405), feature16 = c(0.41849, 0.45628, 0.37617, 0.39334, 0.46727, 0.36297, 0.3054, 0.41256, 0.6302, 0.41892), feature2 = c(0.62194,  0.5555, 0.61301, 0.27452, 0.74148, 0.49785, 0.5215, 0.46492,     0.54834, 0.58106), feature21 = c(0.32122, 0.37679, 0.35889, 0.74368,     0.18306, 0.47027, 0.40567, 0.47801, 0.41617, 0.35244), feature12 = c(0.56532,     0.55707, 0.49138, 0.24911, 0.69341, 0.42176, 0.41445, 0.45535,     0.62379, 0.5523), target = c(1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,     1L, 1L)), .Names = c("feature19", "feature6", "feature10", "feature20",     "feature7", "feature4", "feature16", "feature2", "feature21",     "feature12", "target"), row.names = c(NA, 10L), class = "data.frame")


Does anyone know whether I have to preprocess the data for xgbTree?
Thank you!










share|improve this question




























    up vote
    0
    down vote

    favorite












    I am running for example the following code:



    v.ctrl <- trainControl(method = "repeatedcv", repeats = 1,number = 3, 
    summaryFunction = twoClassSummary,
    classProbs = TRUE,
    allowParallel=T)

    xgb.grid <- expand.grid(nrounds = 10000,
    eta = c(0.01,0.05,0.1),
    max_depth = c(2,4,6,8,10,14))
    set.seed(45)
    xgb_tune <-train(target~.,
    data = train,
    method = "xgbTree",
    trControl = cv.ctrl,
    tuneGrid = xgb.grid,
    verbose = TRUE,
    metric = "LogLoss",
    nthread = 3)


    The error is simple:




    Error in train(target ~ ., data = train, method = "xgbTree", trControl = cv.ctrl, :
    unused arguments (data = train, method = "xgbTree", trControl = cv.ctrl, tuneGrid = xgb.grid, verbose = T, metric = "LogLoss", nthread = 3)




    My dataset



    structure(list(feature19 = c(0.58776, 0.40764, 0.4708, 0.67577, 0.41681, 0.5291, 0.33197, 0.24138, 0.49776, 0.58293), feature6 = c(0.48424, 0.48828, 0.58975, 0.33185, 0.6917, 0.53813, 0.76235, 0.7036,     0.33871, 0.51928), feature10 = c(0.61347, 0.65801, 0.69926, 0.23311,     0.8134, 0.55321, 0.72926, 0.663, 0.49206, 0.55531), feature20 = c(0.39615,     0.49085, 0.50274, 0.6038, 0.37487, 0.53582, 0.62004, 0.63819,     0.37858, 0.40478), feature7 = c(0.55901, 0.38715, 0.50705, 0.76004,     0.3207, 0.54697, 0.31014, 0.21932, 0.4831, 0.52253), feature4 = c(0.5379,     0.52526, 0.44264, 0.28974, 0.65142, 0.41382, 0.44205, 0.47272,     0.6303, 0.56405), feature16 = c(0.41849, 0.45628, 0.37617, 0.39334, 0.46727, 0.36297, 0.3054, 0.41256, 0.6302, 0.41892), feature2 = c(0.62194,  0.5555, 0.61301, 0.27452, 0.74148, 0.49785, 0.5215, 0.46492,     0.54834, 0.58106), feature21 = c(0.32122, 0.37679, 0.35889, 0.74368,     0.18306, 0.47027, 0.40567, 0.47801, 0.41617, 0.35244), feature12 = c(0.56532,     0.55707, 0.49138, 0.24911, 0.69341, 0.42176, 0.41445, 0.45535,     0.62379, 0.5523), target = c(1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,     1L, 1L)), .Names = c("feature19", "feature6", "feature10", "feature20",     "feature7", "feature4", "feature16", "feature2", "feature21",     "feature12", "target"), row.names = c(NA, 10L), class = "data.frame")


    Does anyone know whether I have to reprocess the data for xgbtree?
    Thx u!










    share|improve this question


























      up vote
      0
      down vote

      favorite









      up vote
      0
      down vote

      favorite











      I am running for example the following code:



      v.ctrl <- trainControl(method = "repeatedcv", repeats = 1,number = 3, 
      summaryFunction = twoClassSummary,
      classProbs = TRUE,
      allowParallel=T)

      xgb.grid <- expand.grid(nrounds = 10000,
      eta = c(0.01,0.05,0.1),
      max_depth = c(2,4,6,8,10,14))
      set.seed(45)
      xgb_tune <-train(target~.,
      data = train,
      method = "xgbTree",
      trControl = cv.ctrl,
      tuneGrid = xgb.grid,
      verbose = TRUE,
      metric = "LogLoss",
      nthread = 3)


      The error is simple:




      Error in train(target ~ ., data = train, method = "xgbTree", trControl = cv.ctrl, :
      unused arguments (data = train, method = "xgbTree", trControl = cv.ctrl, tuneGrid = xgb.grid, verbose = T, metric = "LogLoss", nthread = 3)




      My dataset



      structure(list(feature19 = c(0.58776, 0.40764, 0.4708, 0.67577, 0.41681, 0.5291, 0.33197, 0.24138, 0.49776, 0.58293), feature6 = c(0.48424, 0.48828, 0.58975, 0.33185, 0.6917, 0.53813, 0.76235, 0.7036,     0.33871, 0.51928), feature10 = c(0.61347, 0.65801, 0.69926, 0.23311,     0.8134, 0.55321, 0.72926, 0.663, 0.49206, 0.55531), feature20 = c(0.39615,     0.49085, 0.50274, 0.6038, 0.37487, 0.53582, 0.62004, 0.63819,     0.37858, 0.40478), feature7 = c(0.55901, 0.38715, 0.50705, 0.76004,     0.3207, 0.54697, 0.31014, 0.21932, 0.4831, 0.52253), feature4 = c(0.5379,     0.52526, 0.44264, 0.28974, 0.65142, 0.41382, 0.44205, 0.47272,     0.6303, 0.56405), feature16 = c(0.41849, 0.45628, 0.37617, 0.39334, 0.46727, 0.36297, 0.3054, 0.41256, 0.6302, 0.41892), feature2 = c(0.62194,  0.5555, 0.61301, 0.27452, 0.74148, 0.49785, 0.5215, 0.46492,     0.54834, 0.58106), feature21 = c(0.32122, 0.37679, 0.35889, 0.74368,     0.18306, 0.47027, 0.40567, 0.47801, 0.41617, 0.35244), feature12 = c(0.56532,     0.55707, 0.49138, 0.24911, 0.69341, 0.42176, 0.41445, 0.45535,     0.62379, 0.5523), target = c(1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,     1L, 1L)), .Names = c("feature19", "feature6", "feature10", "feature20",     "feature7", "feature4", "feature16", "feature2", "feature21",     "feature12", "target"), row.names = c(NA, 10L), class = "data.frame")


      Does anyone know whether I have to reprocess the data for xgbtree?
      Thx u!










      share|improve this question















      I am running for example the following code:



      v.ctrl <- trainControl(method = "repeatedcv", repeats = 1,number = 3, 
      summaryFunction = twoClassSummary,
      classProbs = TRUE,
      allowParallel=T)

      xgb.grid <- expand.grid(nrounds = 10000,
      eta = c(0.01,0.05,0.1),
      max_depth = c(2,4,6,8,10,14))
      set.seed(45)
      xgb_tune <-train(target~.,
      data = train,
      method = "xgbTree",
      trControl = cv.ctrl,
      tuneGrid = xgb.grid,
      verbose = TRUE,
      metric = "LogLoss",
      nthread = 3)


      The error is simple:




      Error in train(target ~ ., data = train, method = "xgbTree", trControl = cv.ctrl, :
      unused arguments (data = train, method = "xgbTree", trControl = cv.ctrl, tuneGrid = xgb.grid, verbose = T, metric = "LogLoss", nthread = 3)




      My dataset



      structure(list(feature19 = c(0.58776, 0.40764, 0.4708, 0.67577, 0.41681, 0.5291, 0.33197, 0.24138, 0.49776, 0.58293), feature6 = c(0.48424, 0.48828, 0.58975, 0.33185, 0.6917, 0.53813, 0.76235, 0.7036,     0.33871, 0.51928), feature10 = c(0.61347, 0.65801, 0.69926, 0.23311,     0.8134, 0.55321, 0.72926, 0.663, 0.49206, 0.55531), feature20 = c(0.39615,     0.49085, 0.50274, 0.6038, 0.37487, 0.53582, 0.62004, 0.63819,     0.37858, 0.40478), feature7 = c(0.55901, 0.38715, 0.50705, 0.76004,     0.3207, 0.54697, 0.31014, 0.21932, 0.4831, 0.52253), feature4 = c(0.5379,     0.52526, 0.44264, 0.28974, 0.65142, 0.41382, 0.44205, 0.47272,     0.6303, 0.56405), feature16 = c(0.41849, 0.45628, 0.37617, 0.39334, 0.46727, 0.36297, 0.3054, 0.41256, 0.6302, 0.41892), feature2 = c(0.62194,  0.5555, 0.61301, 0.27452, 0.74148, 0.49785, 0.5215, 0.46492,     0.54834, 0.58106), feature21 = c(0.32122, 0.37679, 0.35889, 0.74368,     0.18306, 0.47027, 0.40567, 0.47801, 0.41617, 0.35244), feature12 = c(0.56532,     0.55707, 0.49138, 0.24911, 0.69341, 0.42176, 0.41445, 0.45535,     0.62379, 0.5523), target = c(1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,     1L, 1L)), .Names = c("feature19", "feature6", "feature10", "feature20",     "feature7", "feature4", "feature16", "feature2", "feature21",     "feature12", "target"), row.names = c(NA, 10L), class = "data.frame")


      Does anyone know whether I have to reprocess the data for xgbtree?
      Thx u!







      r r-caret






      share|improve this question















      share|improve this question













      share|improve this question




      share|improve this question








      edited Nov 8 at 17:46









      Tjebo

      2,1171126




      2,1171126










      asked Jul 25 '17 at 14:02







      user5898644































          1 Answer
          1






          active

          oldest

          votes

















          up vote
          0
          down vote



          accepted










          I realize I am kind of a noob when it comes to R/Caret/machine learning, but I saw your post after constantly checking for responses to my question and I managed to get your code working. I hope someone more knowledgeable is able to fully answer your questions, but in the meantime, here is what I did.



          First, I entered your data set into R and tried running your code. I believe you have a typo in your control object: it is created as v.ctrl but passed to train() as cv.ctrl (you are missing a "c" in "cv"), which may lead to the issues you are having with unused arguments.



          However, after I resolved that issue there were multiple errors and warnings. For one, you are using twoClassSummary but specifying logLoss as the metric (note the capitalization here: it's logLoss, not LogLoss, in case that changes anything). Instead, I switched the summaryFunction to mnLogLoss so that logLoss is computed properly, as from what I've read twoClassSummary uses AUC as its metric. Also, I replaced your "target" variable in your training set with a simple character variable, in this case "Y" or "N". You can download the csv file here.



          After that, I kept receiving an error regarding your tuning grid, stating essentially that you were missing tuning parameters for the xgboost methods, which are listed in the caret documentation (available models). I simply added the default values for the rest of the parameters (most of which were 1). The tuning grid I used can be found here.



          My final code I used to actually train the xgb model was as follows:



          control = trainControl(method = "repeatedcv", repeats = 1, number = 3, 
          summaryFunction = mnLogLoss,
          classProbs = TRUE,
          allowParallel=T)

          tune = train(x=set[,1:10], y=set[,11], method="xgbTree", trControl=control,
          tuneGrid = xgb.grid, verbose=TRUE, metric="logLoss", nthread=3)


          And the output is shown here:



          tune
          eXtreme Gradient Boosting

          10 samples
          10 predictors
          2 classes: 'N', 'Y'

          No pre-processing
          Resampling: Cross-Validated (3 fold, repeated 1 times)
          Summary of sample sizes: 6, 8, 6
          Resampling results across tuning parameters:

          eta max_depth logLoss
          0.01 2 0.6914816
          0.01 4 0.6914816
          0.01 6 0.6914816
          0.01 8 0.6914816
          0.01 10 0.6914816
          0.01 14 0.6914816
          0.05 2 0.6848399
          0.05 4 0.6848399
          0.05 6 0.6848399
          0.05 8 0.6848399
          0.05 10 0.6848399
          0.05 14 0.6848399
          0.10 2 0.6765847
          0.10 4 0.6765847
          0.10 6 0.6765847
          0.10 8 0.6765847
          0.10 10 0.6765847
          0.10 14 0.6765847

          Tuning parameter 'nrounds' was held constant at a value of 10000
          Tuning parameter 'gamma' was held constant at a
          value of 0
          Tuning parameter 'colsample_bytree' was held constant at a value of 1
          Tuning parameter
          'min_child_weight' was held constant at a value of 1
          Tuning parameter 'subsample' was held constant at a value of 1
          logLoss was used to select the optimal model using the smallest value.
          The final values used for the model were nrounds = 10000, max_depth = 2, eta
          = 0.1, gamma = 0, colsample_bytree =
          1, min_child_weight = 1 and subsample = 1.


          I hope this helps, and is what you were looking for. I am not sure whether I set up the log loss metric correctly, because it appears that max_depth had literally no effect on log loss. I reran the model using a different metric, AUC, and the results showed no effect regardless of what was changed, and the same was true with Cohen's Kappa. I'm guessing this is due to having only ten samples, but hopefully somebody can actually explain what I did so this is more than just a code dump.






          share|improve this answer





















          • Three resamples. Also, not to get too grumpy, but @Ze4 doesn't give all the information. There are warnings that are generated ("You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column.") that should help figure out some of these issues. We don't know the versions, but I get an error: "Error: The tuning parameter grid should have columns nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample". It would help to provide all the info before anyone sinks time.
            – topepo
            Aug 23 '17 at 1:17











          Your Answer






          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "1"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          convertImagesToLinks: true,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: 10,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














           

          draft saved


          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f45305636%2fxgbtree-caret-matrix-or-not%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown
























          1 Answer
          1






          active

          oldest

          votes








          1 Answer
          1






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes








          up vote
          0
          down vote



          accepted










          I realize I am kind of a noob when it comes to R/Caret/machine learning, but I saw your post after constantly checking for responses to my question and I managed to get your code working. I hope someone more knowledgeable is able to fully answer your questions, but in the meantime, here is what I did.



          First, I inputted your data set into R and tried running your code. I believe you may have a typo in your control function, you are missing a "c" in "cv" which may lead to the issues you are having with unused arguments.



          However, after I resolved that issue there were multiple errors and warnings; for one, you are using twoClassSummary but specifying logLoss (note the syntax here, it's not LogLoss in case that changes anything)...instead I switched this summaryFunction to mnlog to call the logLoss function properly, as from what I've read twoClassSummary uses AUC as its metric. Also, I replaced your "target" variable in your training set with a simple character variable, in this case, "Y" or "N". You can download the csv file here.



          After, I kept receiving an error regarding your tuning grid, stating that essentially your were missing tuning parameters for xgBoost methods which can be found in the documentation for caret (available models). I simply added the default values for the rest of the parameters (most of which were 1). The tuning grid I used can be found here.



          My final code I used to actually train the xgb model was as follows:



          control = trainControl(method = "repeatedcv", repeats = 1, number = 3, 
          summaryFunction = mnLogLoss,
          classProbs = TRUE,
          allowParallel=T)

          tune = train(x=set[,1:10], y=set[,11], method="xgbTree", trControl=control,
          tuneGrid = xgb.grid, verbose=TRUE, metric="logLoss", nthread=3)


          And the output is shown here:



          tune
          eXtreme Gradient Boosting

          10 samples
          10 predictors
          2 classes: 'N', 'Y'

          No pre-processing
          Resampling: Cross-Validated (3 fold, repeated 1 times)
          Summary of sample sizes: 6, 8, 6
          Resampling results across tuning parameters:

          eta max_depth logLoss
          0.01 2 0.6914816
          0.01 4 0.6914816
          0.01 6 0.6914816
          0.01 8 0.6914816
          0.01 10 0.6914816
          0.01 14 0.6914816
          0.05 2 0.6848399
          0.05 4 0.6848399
          0.05 6 0.6848399
          0.05 8 0.6848399
          0.05 10 0.6848399
          0.05 14 0.6848399
          0.10 2 0.6765847
          0.10 4 0.6765847
          0.10 6 0.6765847
          0.10 8 0.6765847
          0.10 10 0.6765847
          0.10 14 0.6765847

          Tuning parameter 'nrounds' was held constant at a value of 10000
          Tuning parameter 'gamma' was held constant at a
          value of 0
          Tuning parameter 'colsample_bytree' was held constant at a value of 1
          Tuning parameter
          'min_child_weight' was held constant at a value of 1
          Tuning parameter 'subsample' was held constant at a value of 1
          logLoss was used to select the optimal model using the smallest value.
          The final values used for the model were nrounds = 10000, max_depth = 2, eta
          = 0.1, gamma = 0, colsample_bytree =
          1, min_child_weight = 1 and subsample = 1.


          I hope this helps, and was what you were seeking. I am a bit suspicious if I did the log loss command correctly because it would appear max depth literally had no effect on log loss. I reran the model using a different metric, AUC, and the results showed no effect regardless of what was changed, and same with Cohen's Kappa. I'm guessing this is due to only ten samples, but hopefully somebody can actually explain what I did so this is more than just a code dump.






          share|improve this answer





















          • Three resamples. Also, not to get to grumpy, but @Ze4 doesn't give all the information. There are warnings that are generated ("You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column.") that should help figure out out some of these issue. We don't know the versions, but I get an error "Error: The tuning parameter grid should have columns nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample" It would help to provide all the info before anyone sinks time.
            – topepo
            Aug 23 '17 at 1:17















          up vote
          0
          down vote



          accepted










          I realize I am kind of a noob when it comes to R/Caret/machine learning, but I saw your post after constantly checking for responses to my question and I managed to get your code working. I hope someone more knowledgeable is able to fully answer your questions, but in the meantime, here is what I did.



          First, I inputted your data set into R and tried running your code. I believe you may have a typo in your control function, you are missing a "c" in "cv" which may lead to the issues you are having with unused arguments.



          However, after I resolved that issue there were multiple errors and warnings; for one, you are using twoClassSummary but specifying logLoss (note the syntax here, it's not LogLoss in case that changes anything)...instead I switched this summaryFunction to mnlog to call the logLoss function properly, as from what I've read twoClassSummary uses AUC as its metric. Also, I replaced your "target" variable in your training set with a simple character variable, in this case, "Y" or "N". You can download the csv file here.



          After, I kept receiving an error regarding your tuning grid, stating that essentially your were missing tuning parameters for xgBoost methods which can be found in the documentation for caret (available models). I simply added the default values for the rest of the parameters (most of which were 1). The tuning grid I used can be found here.



          My final code I used to actually train the xgb model was as follows:



          control = trainControl(method = "repeatedcv", repeats = 1, number = 3, 
          summaryFunction = mnLogLoss,
          classProbs = TRUE,
          allowParallel=T)

          tune = train(x=set[,1:10], y=set[,11], method="xgbTree", trControl=control,
          tuneGrid = xgb.grid, verbose=TRUE, metric="logLoss", nthread=3)


          And the output is shown here:



          tune
          eXtreme Gradient Boosting

          10 samples
          10 predictors
          2 classes: 'N', 'Y'

          No pre-processing
          Resampling: Cross-Validated (3 fold, repeated 1 times)
          Summary of sample sizes: 6, 8, 6
          Resampling results across tuning parameters:

          eta max_depth logLoss
          0.01 2 0.6914816
          0.01 4 0.6914816
          0.01 6 0.6914816
          0.01 8 0.6914816
          0.01 10 0.6914816
          0.01 14 0.6914816
          0.05 2 0.6848399
          0.05 4 0.6848399
          0.05 6 0.6848399
          0.05 8 0.6848399
          0.05 10 0.6848399
          0.05 14 0.6848399
          0.10 2 0.6765847
          0.10 4 0.6765847
          0.10 6 0.6765847
          0.10 8 0.6765847
          0.10 10 0.6765847
          0.10 14 0.6765847

          Tuning parameter 'nrounds' was held constant at a value of 10000
          Tuning parameter 'gamma' was held constant at a
          value of 0
          Tuning parameter 'colsample_bytree' was held constant at a value of 1
          Tuning parameter
          'min_child_weight' was held constant at a value of 1
          Tuning parameter 'subsample' was held constant at a value of 1
          logLoss was used to select the optimal model using the smallest value.
          The final values used for the model were nrounds = 10000, max_depth = 2, eta
          = 0.1, gamma = 0, colsample_bytree =
          1, min_child_weight = 1 and subsample = 1.


          I hope this helps, and was what you were seeking. I am a bit suspicious if I did the log loss command correctly because it would appear max depth literally had no effect on log loss. I reran the model using a different metric, AUC, and the results showed no effect regardless of what was changed, and same with Cohen's Kappa. I'm guessing this is due to only ten samples, but hopefully somebody can actually explain what I did so this is more than just a code dump.






          share|improve this answer





















          • Three resamples. Also, not to get to grumpy, but @Ze4 doesn't give all the information. There are warnings that are generated ("You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column.") that should help figure out out some of these issue. We don't know the versions, but I get an error "Error: The tuning parameter grid should have columns nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample" It would help to provide all the info before anyone sinks time.
            – topepo
            Aug 23 '17 at 1:17













          up vote
          0
          down vote



          accepted







          up vote
          0
          down vote



          accepted






          I realize I am kind of a noob when it comes to R/Caret/machine learning, but I saw your post after constantly checking for responses to my question and I managed to get your code working. I hope someone more knowledgeable is able to fully answer your questions, but in the meantime, here is what I did.



          First, I inputted your data set into R and tried running your code. I believe you may have a typo in your control function, you are missing a "c" in "cv" which may lead to the issues you are having with unused arguments.



          However, after I resolved that issue there were multiple errors and warnings; for one, you are using twoClassSummary but specifying logLoss (note the syntax here, it's not LogLoss in case that changes anything)...instead I switched this summaryFunction to mnlog to call the logLoss function properly, as from what I've read twoClassSummary uses AUC as its metric. Also, I replaced your "target" variable in your training set with a simple character variable, in this case, "Y" or "N". You can download the csv file here.



          After, I kept receiving an error regarding your tuning grid, stating that essentially your were missing tuning parameters for xgBoost methods which can be found in the documentation for caret (available models). I simply added the default values for the rest of the parameters (most of which were 1). The tuning grid I used can be found here.



          My final code I used to actually train the xgb model was as follows:



          control = trainControl(method = "repeatedcv", repeats = 1, number = 3, 
          summaryFunction = mnLogLoss,
          classProbs = TRUE,
          allowParallel=T)

          tune = train(x=set[,1:10], y=set[,11], method="xgbTree", trControl=control,
          tuneGrid = xgb.grid, verbose=TRUE, metric="logLoss", nthread=3)


          And the output is shown here:



          tune
          eXtreme Gradient Boosting

          10 samples
          10 predictors
          2 classes: 'N', 'Y'

          No pre-processing
          Resampling: Cross-Validated (3 fold, repeated 1 times)
          Summary of sample sizes: 6, 8, 6
          Resampling results across tuning parameters:

          eta max_depth logLoss
          0.01 2 0.6914816
          0.01 4 0.6914816
          0.01 6 0.6914816
          0.01 8 0.6914816
          0.01 10 0.6914816
          0.01 14 0.6914816
          0.05 2 0.6848399
          0.05 4 0.6848399
          0.05 6 0.6848399
          0.05 8 0.6848399
          0.05 10 0.6848399
          0.05 14 0.6848399
          0.10 2 0.6765847
          0.10 4 0.6765847
          0.10 6 0.6765847
          0.10 8 0.6765847
          0.10 10 0.6765847
          0.10 14 0.6765847

          Tuning parameter 'nrounds' was held constant at a value of 10000
          Tuning parameter 'gamma' was held constant at a
          value of 0
          Tuning parameter 'colsample_bytree' was held constant at a value of 1
          Tuning parameter
          'min_child_weight' was held constant at a value of 1
          Tuning parameter 'subsample' was held constant at a value of 1
          logLoss was used to select the optimal model using the smallest value.
          The final values used for the model were nrounds = 10000, max_depth = 2, eta
          = 0.1, gamma = 0, colsample_bytree =
          1, min_child_weight = 1 and subsample = 1.


          I hope this helps, and was what you were seeking. I am a bit suspicious if I did the log loss command correctly because it would appear max depth literally had no effect on log loss. I reran the model using a different metric, AUC, and the results showed no effect regardless of what was changed, and same with Cohen's Kappa. I'm guessing this is due to only ten samples, but hopefully somebody can actually explain what I did so this is more than just a code dump.






          share|improve this answer












          I realize I am kind of a noob when it comes to R/Caret/machine learning, but I saw your post after constantly checking for responses to my question and I managed to get your code working. I hope someone more knowledgeable is able to fully answer your questions, but in the meantime, here is what I did.



          First, I inputted your data set into R and tried running your code. I believe you may have a typo in your control function, you are missing a "c" in "cv" which may lead to the issues you are having with unused arguments.



          However, after I resolved that issue there were multiple errors and warnings; for one, you are using twoClassSummary but specifying logLoss (note the syntax here, it's not LogLoss in case that changes anything)...instead I switched this summaryFunction to mnlog to call the logLoss function properly, as from what I've read twoClassSummary uses AUC as its metric. Also, I replaced your "target" variable in your training set with a simple character variable, in this case, "Y" or "N". You can download the csv file here.



          After, I kept receiving an error regarding your tuning grid, stating that essentially your were missing tuning parameters for xgBoost methods which can be found in the documentation for caret (available models). I simply added the default values for the rest of the parameters (most of which were 1). The tuning grid I used can be found here.



          My final code I used to actually train the xgb model was as follows:



          control = trainControl(method = "repeatedcv", repeats = 1, number = 3, 
          summaryFunction = mnLogLoss,
          classProbs = TRUE,
          allowParallel=T)

          tune = train(x=set[,1:10], y=set[,11], method="xgbTree", trControl=control,
          tuneGrid = xgb.grid, verbose=TRUE, metric="logLoss", nthread=3)


          And the output is shown here:



          tune
          eXtreme Gradient Boosting

          10 samples
          10 predictors
          2 classes: 'N', 'Y'

          No pre-processing
          Resampling: Cross-Validated (3 fold, repeated 1 times)
          Summary of sample sizes: 6, 8, 6
          Resampling results across tuning parameters:

          eta max_depth logLoss
          0.01 2 0.6914816
          0.01 4 0.6914816
          0.01 6 0.6914816
          0.01 8 0.6914816
          0.01 10 0.6914816
          0.01 14 0.6914816
          0.05 2 0.6848399
          0.05 4 0.6848399
          0.05 6 0.6848399
          0.05 8 0.6848399
          0.05 10 0.6848399
          0.05 14 0.6848399
          0.10 2 0.6765847
          0.10 4 0.6765847
          0.10 6 0.6765847
          0.10 8 0.6765847
          0.10 10 0.6765847
          0.10 14 0.6765847

          Tuning parameter 'nrounds' was held constant at a value of 10000
          Tuning parameter 'gamma' was held constant at a
          value of 0
          Tuning parameter 'colsample_bytree' was held constant at a value of 1
          Tuning parameter
          'min_child_weight' was held constant at a value of 1
          Tuning parameter 'subsample' was held constant at a value of 1
          logLoss was used to select the optimal model using the smallest value.
          The final values used for the model were nrounds = 10000, max_depth = 2, eta
          = 0.1, gamma = 0, colsample_bytree =
          1, min_child_weight = 1 and subsample = 1.


          I hope this helps, and was what you were seeking. I am a bit suspicious if I did the log loss command correctly because it would appear max depth literally had no effect on log loss. I reran the model using a different metric, AUC, and the results showed no effect regardless of what was changed, and same with Cohen's Kappa. I'm guessing this is due to only ten samples, but hopefully somebody can actually explain what I did so this is more than just a code dump.







          share|improve this answer












          share|improve this answer



          share|improve this answer










          answered Jul 26 '17 at 3:01









          aranglol

          163




          163












          • Three resamples. Also, not to get too grumpy, but @Ze4 doesn't give all the information. There are warnings that are generated ("You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column.") that should help figure out some of these issues. We don't know the versions, but I get an error "Error: The tuning parameter grid should have columns nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample". It would help to provide all the info before anyone sinks time.
            – topepo
            Aug 23 '17 at 1:17


















          • Three resamples. Also, not to get too grumpy, but @Ze4 doesn't give all the information. There are warnings that are generated ("You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column.") that should help figure out some of these issues. We don't know the versions, but I get an error "Error: The tuning parameter grid should have columns nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample". It would help to provide all the info before anyone sinks time.
            – topepo
            Aug 23 '17 at 1:17
















          Three resamples. Also, not to get too grumpy, but @Ze4 doesn't give all the information. There are warnings that are generated ("You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column.") that should help figure out some of these issues. We don't know the versions, but I get an error "Error: The tuning parameter grid should have columns nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample". It would help to provide all the info before anyone sinks time.
          – topepo
          Aug 23 '17 at 1:17




          Three resamples. Also, not to get too grumpy, but @Ze4 doesn't give all the information. There are warnings that are generated ("You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column.") that should help figure out some of these issues. We don't know the versions, but I get an error "Error: The tuning parameter grid should have columns nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample". It would help to provide all the info before anyone sinks time.
          – topepo
          Aug 23 '17 at 1:17


















           

          draft saved


          draft discarded



















































           


          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f45305636%2fxgbtree-caret-matrix-or-not%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Guess what letter conforming each word

          Run scheduled task as local user group (not BUILTIN)

          Port of Spain