Map specific columns to a function that takes in two arguments

After running kmeans() on the iris dataset with different numbers of clusters (k = 2, 3, 4, 5) using the map() function, I would like to interpret the results for each k using a predefined function.

Below is my attempt:

library(dplyr)

library(purrr)



# Standardise the four numeric measurements once; the scaled matrix is the
# same for every k, so it does not need to be recomputed inside the loop.
# `datasets::iris` is referenced explicitly because this script rebinds the
# name `iris` below; without it, a second run would feed the previously added
# cluster columns into kmeans().
iris_scaled <- scale(datasets::iris[-5])

# Fit k-means for k = 2, ..., 5. kmeans() chooses its starting centres at
# random, so cluster labels and sizes can differ from run to run.
cluster_assignment <- map(2:5, function(k) {
  result <- kmeans(x = iris_scaled, centers = k)

  # return results as a list; element 1 (the cluster vector) is extracted below
  list(result$cluster,
       result$tot.withinss,
       result$centers,
       result$size)
})

# assign cluster results back to the iris dataset
a <- map_dfc(cluster_assignment, 1)
colnames(a) <- paste0("result_", 2:5, "_cl")
iris <- bind_cols(datasets::iris, a)



> head(iris)

  Sepal.Length Sepal.Width Petal.Length Petal.Width Species result_2_cl result_3_cl result_4_cl result_5_cl

1          5.1         3.5          1.4         0.2  setosa           2           2           3           3

2          4.9         3.0          1.4         0.2  setosa           2           1           3           2

3          4.7         3.2          1.3         0.2  setosa           2           1           3           2

4          4.6         3.1          1.5         0.2  setosa           2           1           3           2

5          5.0         3.6          1.4         0.2  setosa           2           2           3           3

6          5.4         3.9          1.7         0.4  setosa           2           2           3           5

Now, I would like to apply a predefined function cluster_result2 to the newly assigned columns, i.e. "result_2_cl", "result_3_cl", "result_4_cl", "result_5_cl"

# predefined function

#' Summarise a data frame by one or more grouping columns
#'
#' @param x A data frame containing a `Sepal.Length` column.
#' @param ... Names of the grouping columns, given as character strings
#'   (e.g. `"result_2_cl"`).
#' @return A tibble with one row per group: the grouping column(s), `size`
#'   (number of rows in the group) and `mean_spl` (mean of `Sepal.Length`).
cluster_result2 <- function(x, ...) {
  x %>%
    # group_by_() is deprecated; across(all_of()) selects columns by name
    group_by(across(all_of(c(...)))) %>%
    summarise(size = n(),
              mean_spl = mean(Sepal.Length))
}



# tried this method, but did not get the expected output
# NOTE(review): map() iterates over the columns of iris[, colnames(a)], so .x
# is each column's vector of cluster labels rather than the column's name;
# the working calls to cluster_result2() pass a name such as colnames(a)[1].

map(iris[, colnames(a)], ~ cluster_result2(iris, .x))

How can I achieve this using the tidyverse approach? I found a very similar approach here, but couldn't get the expected output.

The expected output will be similar to the ones below, except they are stored in a nested list/dataframe:

> cluster_result2(iris, colnames(a)[1])

# A tibble: 2 x 3

  result_2_cl  size mean_spl

        <int> <int>    <dbl>

1           1   100     6.26

2           2    50     5.01

> cluster_result2(iris, colnames(a)[2])

# A tibble: 3 x 3

  result_3_cl  size mean_spl

        <int> <int>    <dbl>

1           1    21     4.75

2           2    33     5.17

3           3    96     6.31



  > cluster_result2(iris, colnames(a)[3])

# A tibble: 4 x 3

  result_4_cl  size mean_spl

        <int> <int>    <dbl>

1           1    29     7.00

2           2    50     6.14

3           3    49     5.02

4           4    22     5.50



> cluster_result2(iris, colnames(a)[4])

# A tibble: 5 x 3

  result_5_cl  size mean_spl

        <int> <int>    <dbl>

1           1    47     6.78

2           2    17     4.69

3           3    26     5.07

4           4    53     5.80

5           5     7     5.53

Appreciate your answers!

edited Nov 21 '18 at 5:03

asked Nov 21 '18 at 4:02

jacky_learns_to_code

4852620

add a comment |

Below is my attempt:

library(dplyr)

library(purrr)



# Standardise the four numeric measurements once; the scaled matrix is the
# same for every k, so it does not need to be recomputed inside the loop.
# `datasets::iris` is referenced explicitly because this script rebinds the
# name `iris` below; without it, a second run would feed the previously added
# cluster columns into kmeans().
iris_scaled <- scale(datasets::iris[-5])

# Fit k-means for k = 2, ..., 5. kmeans() chooses its starting centres at
# random, so cluster labels and sizes can differ from run to run.
cluster_assignment <- map(2:5, function(k) {
  result <- kmeans(x = iris_scaled, centers = k)

  # return results as a list; element 1 (the cluster vector) is extracted below
  list(result$cluster,
       result$tot.withinss,
       result$centers,
       result$size)
})

# assign cluster results back to the iris dataset
a <- map_dfc(cluster_assignment, 1)
colnames(a) <- paste0("result_", 2:5, "_cl")
iris <- bind_cols(datasets::iris, a)



> head(iris)

  Sepal.Length Sepal.Width Petal.Length Petal.Width Species result_2_cl result_3_cl result_4_cl result_5_cl

1          5.1         3.5          1.4         0.2  setosa           2           2           3           3

2          4.9         3.0          1.4         0.2  setosa           2           1           3           2

3          4.7         3.2          1.3         0.2  setosa           2           1           3           2

4          4.6         3.1          1.5         0.2  setosa           2           1           3           2

5          5.0         3.6          1.4         0.2  setosa           2           2           3           3

6          5.4         3.9          1.7         0.4  setosa           2           2           3           5

Now, I would like to apply a predefined function cluster_result2 to the newly assigned columns, i.e. "result_2_cl", "result_3_cl", "result_4_cl", "result_5_cl"

# predefined function

#' Summarise a data frame by one or more grouping columns
#'
#' @param x A data frame containing a `Sepal.Length` column.
#' @param ... Names of the grouping columns, given as character strings
#'   (e.g. `"result_2_cl"`).
#' @return A tibble with one row per group: the grouping column(s), `size`
#'   (number of rows in the group) and `mean_spl` (mean of `Sepal.Length`).
cluster_result2 <- function(x, ...) {
  x %>%
    # group_by_() is deprecated; across(all_of()) selects columns by name
    group_by(across(all_of(c(...)))) %>%
    summarise(size = n(),
              mean_spl = mean(Sepal.Length))
}



# tried this method, but did not get the expected output
# NOTE(review): map() iterates over the columns of iris[, colnames(a)], so .x
# is each column's vector of cluster labels rather than the column's name;
# the working calls to cluster_result2() pass a name such as colnames(a)[1].

map(iris[, colnames(a)], ~ cluster_result2(iris, .x))

How can I achieve this using the tidyverse approach? I found a very similar approach here, but couldn't get the expected output.

The expected output will be similar to the ones below, except they are stored in a nested list/dataframe:

> cluster_result2(iris, colnames(a)[1])

# A tibble: 2 x 3

  result_2_cl  size mean_spl

        <int> <int>    <dbl>

1           1   100     6.26

2           2    50     5.01

> cluster_result2(iris, colnames(a)[2])

# A tibble: 3 x 3

  result_3_cl  size mean_spl

        <int> <int>    <dbl>

1           1    21     4.75

2           2    33     5.17

3           3    96     6.31



  > cluster_result2(iris, colnames(a)[3])

# A tibble: 4 x 3

  result_4_cl  size mean_spl

        <int> <int>    <dbl>

1           1    29     7.00

2           2    50     6.14

3           3    49     5.02

4           4    22     5.50



> cluster_result2(iris, colnames(a)[4])

# A tibble: 5 x 3

  result_5_cl  size mean_spl

        <int> <int>    <dbl>

1           1    47     6.78

2           2    17     4.69

3           3    26     5.07

4           4    53     5.80

5           5     7     5.53

Appreciate your answers!

edited Nov 21 '18 at 5:03

asked Nov 21 '18 at 4:02

jacky_learns_to_code

4852620

add a comment |

Below is my attempt:

library(dplyr)

library(purrr)



# Standardise the four numeric measurements once; the scaled matrix is the
# same for every k, so it does not need to be recomputed inside the loop.
# `datasets::iris` is referenced explicitly because this script rebinds the
# name `iris` below; without it, a second run would feed the previously added
# cluster columns into kmeans().
iris_scaled <- scale(datasets::iris[-5])

# Fit k-means for k = 2, ..., 5. kmeans() chooses its starting centres at
# random, so cluster labels and sizes can differ from run to run.
cluster_assignment <- map(2:5, function(k) {
  result <- kmeans(x = iris_scaled, centers = k)

  # return results as a list; element 1 (the cluster vector) is extracted below
  list(result$cluster,
       result$tot.withinss,
       result$centers,
       result$size)
})

# assign cluster results back to the iris dataset
a <- map_dfc(cluster_assignment, 1)
colnames(a) <- paste0("result_", 2:5, "_cl")
iris <- bind_cols(datasets::iris, a)



> head(iris)

  Sepal.Length Sepal.Width Petal.Length Petal.Width Species result_2_cl result_3_cl result_4_cl result_5_cl

1          5.1         3.5          1.4         0.2  setosa           2           2           3           3

2          4.9         3.0          1.4         0.2  setosa           2           1           3           2

3          4.7         3.2          1.3         0.2  setosa           2           1           3           2

4          4.6         3.1          1.5         0.2  setosa           2           1           3           2

5          5.0         3.6          1.4         0.2  setosa           2           2           3           3

6          5.4         3.9          1.7         0.4  setosa           2           2           3           5

Now, I would like to apply a predefined function cluster_result2 to the newly assigned columns, i.e. "result_2_cl", "result_3_cl", "result_4_cl", "result_5_cl"

# predefined function

#' Summarise a data frame by one or more grouping columns
#'
#' @param x A data frame containing a `Sepal.Length` column.
#' @param ... Names of the grouping columns, given as character strings
#'   (e.g. `"result_2_cl"`).
#' @return A tibble with one row per group: the grouping column(s), `size`
#'   (number of rows in the group) and `mean_spl` (mean of `Sepal.Length`).
cluster_result2 <- function(x, ...) {
  x %>%
    # group_by_() is deprecated; across(all_of()) selects columns by name
    group_by(across(all_of(c(...)))) %>%
    summarise(size = n(),
              mean_spl = mean(Sepal.Length))
}



# tried this method, but did not get the expected output
# NOTE(review): map() iterates over the columns of iris[, colnames(a)], so .x
# is each column's vector of cluster labels rather than the column's name;
# the working calls to cluster_result2() pass a name such as colnames(a)[1].

map(iris[, colnames(a)], ~ cluster_result2(iris, .x))

How can I achieve this using the tidyverse approach? I found a very similar approach here, but couldn't get the expected output.

The expected output will be similar to the ones below, except they are stored in a nested list/dataframe:

> cluster_result2(iris, colnames(a)[1])

# A tibble: 2 x 3

  result_2_cl  size mean_spl

        <int> <int>    <dbl>

1           1   100     6.26

2           2    50     5.01

> cluster_result2(iris, colnames(a)[2])

# A tibble: 3 x 3

  result_3_cl  size mean_spl

        <int> <int>    <dbl>

1           1    21     4.75

2           2    33     5.17

3           3    96     6.31



  > cluster_result2(iris, colnames(a)[3])

# A tibble: 4 x 3

  result_4_cl  size mean_spl

        <int> <int>    <dbl>

1           1    29     7.00

2           2    50     6.14

3           3    49     5.02

4           4    22     5.50



> cluster_result2(iris, colnames(a)[4])

# A tibble: 5 x 3

  result_5_cl  size mean_spl

        <int> <int>    <dbl>

1           1    47     6.78

2           2    17     4.69

3           3    26     5.07

4           4    53     5.80

5           5     7     5.53

Appreciate your answers!

edited Nov 21 '18 at 5:03

asked Nov 21 '18 at 4:02

jacky_learns_to_code

4852620

Below is my attempt:

library(dplyr)

library(purrr)



# Standardise the four numeric measurements once; the scaled matrix is the
# same for every k, so it does not need to be recomputed inside the loop.
# `datasets::iris` is referenced explicitly because this script rebinds the
# name `iris` below; without it, a second run would feed the previously added
# cluster columns into kmeans().
iris_scaled <- scale(datasets::iris[-5])

# Fit k-means for k = 2, ..., 5. kmeans() chooses its starting centres at
# random, so cluster labels and sizes can differ from run to run.
cluster_assignment <- map(2:5, function(k) {
  result <- kmeans(x = iris_scaled, centers = k)

  # return results as a list; element 1 (the cluster vector) is extracted below
  list(result$cluster,
       result$tot.withinss,
       result$centers,
       result$size)
})

# assign cluster results back to the iris dataset
a <- map_dfc(cluster_assignment, 1)
colnames(a) <- paste0("result_", 2:5, "_cl")
iris <- bind_cols(datasets::iris, a)



> head(iris)

  Sepal.Length Sepal.Width Petal.Length Petal.Width Species result_2_cl result_3_cl result_4_cl result_5_cl

1          5.1         3.5          1.4         0.2  setosa           2           2           3           3

2          4.9         3.0          1.4         0.2  setosa           2           1           3           2

3          4.7         3.2          1.3         0.2  setosa           2           1           3           2

4          4.6         3.1          1.5         0.2  setosa           2           1           3           2

5          5.0         3.6          1.4         0.2  setosa           2           2           3           3

6          5.4         3.9          1.7         0.4  setosa           2           2           3           5

Now, I would like to apply a predefined function cluster_result2 to the newly assigned columns, i.e. "result_2_cl", "result_3_cl", "result_4_cl", "result_5_cl"

# predefined function

#' Summarise a data frame by one or more grouping columns
#'
#' @param x A data frame containing a `Sepal.Length` column.
#' @param ... Names of the grouping columns, given as character strings
#'   (e.g. `"result_2_cl"`).
#' @return A tibble with one row per group: the grouping column(s), `size`
#'   (number of rows in the group) and `mean_spl` (mean of `Sepal.Length`).
cluster_result2 <- function(x, ...) {
  x %>%
    # group_by_() is deprecated; across(all_of()) selects columns by name
    group_by(across(all_of(c(...)))) %>%
    summarise(size = n(),
              mean_spl = mean(Sepal.Length))
}



# tried this method, but did not get the expected output
# NOTE(review): map() iterates over the columns of iris[, colnames(a)], so .x
# is each column's vector of cluster labels rather than the column's name;
# the working calls to cluster_result2() pass a name such as colnames(a)[1].

map(iris[, colnames(a)], ~ cluster_result2(iris, .x))

How can I achieve this using the tidyverse approach? I found a very similar approach here, but couldn't get the expected output.

The expected output will be similar to the ones below, except they are stored in a nested list/dataframe:

> cluster_result2(iris, colnames(a)[1])

# A tibble: 2 x 3

  result_2_cl  size mean_spl

        <int> <int>    <dbl>

1           1   100     6.26

2           2    50     5.01

> cluster_result2(iris, colnames(a)[2])

# A tibble: 3 x 3

  result_3_cl  size mean_spl

        <int> <int>    <dbl>

1           1    21     4.75

2           2    33     5.17

3           3    96     6.31



  > cluster_result2(iris, colnames(a)[3])

# A tibble: 4 x 3

  result_4_cl  size mean_spl

        <int> <int>    <dbl>

1           1    29     7.00

2           2    50     6.14

3           3    49     5.02

4           4    22     5.50



> cluster_result2(iris, colnames(a)[4])

# A tibble: 5 x 3

  result_5_cl  size mean_spl

        <int> <int>    <dbl>

1           1    47     6.78

2           2    17     4.69

3           3    26     5.07

4           4    53     5.80

5           5     7     5.53

Appreciate your answers!

r tidyverse purrr

edited Nov 21 '18 at 5:03

asked Nov 21 '18 at 4:02

jacky_learns_to_code

4852620

edited Nov 21 '18 at 5:03

asked Nov 21 '18 at 4:02

jacky_learns_to_code

4852620

edited Nov 21 '18 at 5:03

asked Nov 21 '18 at 4:02

jacky_learns_to_code

4852620

asked Nov 21 '18 at 4:02

jacky_learns_to_code

4852620

asked Nov 21 '18 at 4:02

jacky_learns_to_code

4852620

add a comment |

1 Answer
1

active

oldest

votes

We can use group_by_at instead of group_by_ (which is deprecated). Here, we need to loop through the column names of 'a' instead of the columns of 'iris'

library(tidyverse)

map(colnames(a), ~ cluster_result2(iris, .x))

Or without using the ~, specify the 'x' parameter as 'iris'

map(colnames(a), cluster_result2, x = iris)

#[[1]]

# A tibble: 2 x 3

#  result_2_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     5.01

#2           2   100     6.26



#[[2]]

# A tibble: 3 x 3

#  result_3_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    47     6.78

#2           2    53     5.80

#3           3    50     5.01



#[[3]]

# A tibble: 4 x 3

#  result_4_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     6.14

#2           2    22     5.50

#3           3    29     7.00

#4           4    49     5.02



#[[4]]

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

-checking with the output of function individually applied to columns

cluster_result2(iris, colnames(a)[4])

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

NOTE: The output will be slightly different due to the randomness

edited Nov 21 '18 at 5:17

answered Nov 21 '18 at 5:11

akrun

414k13202275

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

// Registers a callback with StackExchange.ready that initialises the tag
// renderer and then creates the answer editor.
StackExchange.ready(function() {
  var channelOptions = {
    tags: "".split(" "),
    id: "1"
  };
  initTagRenderer("".split(" "), "".split(" "), channelOptions);

  StackExchange.using("externalEditor", function() {
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled) {
      StackExchange.using("snippets", function() {
        createEditor();
      });
    }
    else {
      createEditor();
    }
  });

  // Configures the answer editor via StackExchange.prepareEditor.
  function createEditor() {
    StackExchange.prepareEditor({
      heartbeatType: 'answer',
      autoActivateHeartbeat: false,
      convertImagesToLinks: true,
      noModals: true,
      showLowRepImageUploadWarning: true,
      reputationToPostImages: 10,
      bindNavPrevention: true,
      postfix: "",
      imageUploader: {
        // The \u003c / \u003e escapes and the escaped inner quotes are
        // restored here; without them the string literals are not valid.
        brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
        contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
        allowUrls: true
      },
      onDemand: true,
      discardSelector: ".discard-answer",
      immediatelyShowMarkdownHelp: true
    });
  }
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53405096%2fmap-specific-columns-to-a-function-that-takes-in-two-arguments%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

We can use group_by_at instead of group_by_ (which is deprecated). Here, we need to loop through the column names of 'a' instead of the columns of 'iris'

library(tidyverse)

map(colnames(a), ~ cluster_result2(iris, .x))

Or without using the ~, specify the 'x' parameter as 'iris'

map(colnames(a), cluster_result2, x = iris)

#[[1]]

# A tibble: 2 x 3

#  result_2_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     5.01

#2           2   100     6.26



#[[2]]

# A tibble: 3 x 3

#  result_3_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    47     6.78

#2           2    53     5.80

#3           3    50     5.01



#[[3]]

# A tibble: 4 x 3

#  result_4_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     6.14

#2           2    22     5.50

#3           3    29     7.00

#4           4    49     5.02



#[[4]]

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

-checking with the output of function individually applied to columns

cluster_result2(iris, colnames(a)[4])

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

NOTE: The output will be slightly different due to the randomness

edited Nov 21 '18 at 5:17

answered Nov 21 '18 at 5:11

akrun

414k13202275

add a comment |

We can use group_by_at instead of group_by_ (which is deprecated). Here, we need to loop through the column names of 'a' instead of the columns of 'iris'

library(tidyverse)

map(colnames(a), ~ cluster_result2(iris, .x))

Or without using the ~, specify the 'x' parameter as 'iris'

map(colnames(a), cluster_result2, x = iris)

#[[1]]

# A tibble: 2 x 3

#  result_2_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     5.01

#2           2   100     6.26



#[[2]]

# A tibble: 3 x 3

#  result_3_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    47     6.78

#2           2    53     5.80

#3           3    50     5.01



#[[3]]

# A tibble: 4 x 3

#  result_4_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     6.14

#2           2    22     5.50

#3           3    29     7.00

#4           4    49     5.02



#[[4]]

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

-checking with the output of function individually applied to columns

cluster_result2(iris, colnames(a)[4])

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

NOTE: The output will be slightly different due to the randomness

edited Nov 21 '18 at 5:17

answered Nov 21 '18 at 5:11

akrun

414k13202275

add a comment |

We can use group_by_at instead of group_by_ (which is deprecated). Here, we need to loop through the column names of 'a' instead of the columns of 'iris'

library(tidyverse)

map(colnames(a), ~ cluster_result2(iris, .x))

Or without using the ~, specify the 'x' parameter as 'iris'

map(colnames(a), cluster_result2, x = iris)

#[[1]]

# A tibble: 2 x 3

#  result_2_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     5.01

#2           2   100     6.26



#[[2]]

# A tibble: 3 x 3

#  result_3_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    47     6.78

#2           2    53     5.80

#3           3    50     5.01



#[[3]]

# A tibble: 4 x 3

#  result_4_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     6.14

#2           2    22     5.50

#3           3    29     7.00

#4           4    49     5.02



#[[4]]

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

-checking with the output of function individually applied to columns

cluster_result2(iris, colnames(a)[4])

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

NOTE: The output will be slightly different due to the randomness

edited Nov 21 '18 at 5:17

answered Nov 21 '18 at 5:11

akrun

414k13202275

We can use group_by_at instead of group_by_ (which is deprecated). Here, we need to loop through the column names of 'a' instead of the columns of 'iris'

library(tidyverse)

map(colnames(a), ~ cluster_result2(iris, .x))

Or without using the ~, specify the 'x' parameter as 'iris'

map(colnames(a), cluster_result2, x = iris)

#[[1]]

# A tibble: 2 x 3

#  result_2_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     5.01

#2           2   100     6.26



#[[2]]

# A tibble: 3 x 3

#  result_3_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    47     6.78

#2           2    53     5.80

#3           3    50     5.01



#[[3]]

# A tibble: 4 x 3

#  result_4_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    50     6.14

#2           2    22     5.50

#3           3    29     7.00

#4           4    49     5.02



#[[4]]

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

-checking with the output of function individually applied to columns

cluster_result2(iris, colnames(a)[4])

# A tibble: 5 x 3

#  result_5_cl  size mean_spl

#        <int> <int>    <dbl>

#1           1    16     5.32

#2           2    29     7.00

#3           3    23     5.55

#4           4    34     4.86

#5           5    48     6.16

NOTE: The output will be slightly different due to the randomness

edited Nov 21 '18 at 5:17

answered Nov 21 '18 at 5:11

akrun

414k13202275

edited Nov 21 '18 at 5:17

answered Nov 21 '18 at 5:11

akrun

414k13202275

answered Nov 21 '18 at 5:11

akrun

414k13202275

answered Nov 21 '18 at 5:11

akrun

414k13202275

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Agfdhyk