class: center, middle, title-slide # Building a package that fits into an evolving ecosystem ## OCRUG ### Emil Hvitfeldt ### 2019-11-19 --- class: listfillpage # Overview .listfill[ - Open Source Software Development - My journey - What I learned ] This talk is based on anecdotes; I dearly hope that they generalize! --- # As a new developer it can be hard to find problems that are: - Easy enough for you to do - Prominent enough that they are worth solving ## My advice - Keep a list of ideas - Google early and often about implementations Working on an implementation can still be fruitful even if it doesn't end up on CRAN. --- .center[ data:image/s3,"s3://crabby-images/ce4a4/ce4a4b353a9a6dd10e7e54cf2f26773c63dd9a0c" alt=":scale 80%" ] --- .center[ data:image/s3,"s3://crabby-images/0decc/0decc5bbf8959450beaf8c3d31a423bd19fab869" alt=":scale 90%" ] --- # Look for GitHub tags data:image/s3,"s3://crabby-images/57657/57657948e24df2cb9918150280f0bbd2662e1c6c" alt="" data:image/s3,"s3://crabby-images/27d2c/27d2c7a0b7c28c3beed469e6bb18cb1de320f429" alt="" data:image/s3,"s3://crabby-images/3be46/3be46e39a11087d8acf0413a21d1751ee7099053" alt=":scale 70%" --- data:image/s3,"s3://crabby-images/9463c/9463c36432c98aa3dc6c2f40404faafa5e9d71f7" alt="" --- # Ask before starting the work This makes sure that: - you are not doing the same work as someone else - the work is wanted --- .center[ data:image/s3,"s3://crabby-images/c1679/c167904853c4f64b72963b94feee5e2cc04fc7e8" alt=":scale 85%" ] Artwork by @allison_horst --- # GitHub Projects .center[ data:image/s3,"s3://crabby-images/30e2b/30e2b6de5e6e95829a3b2ce8d1810f60ab909274" alt="" ] --- .center[ data:image/s3,"s3://crabby-images/6976f/6976f3addfa9b0750867ac28029d3842220f439b" alt=":scale 80%" ] --- data:image/s3,"s3://crabby-images/15322/15322a1c43b9266a9fd11994728d52d5dcf637dd" alt="" --- class: no-padding data:image/s3,"s3://crabby-images/39e9e/39e9e7d02e5bc330670b16cfdbdb4353a8caf697" alt="" --- # Challenges - Inherent messy structure of text
- steps don't have a specified order

--

# Design choice

## Flexibility > speed

---

data:image/s3,"s3://crabby-images/e3770/e3770c571bd3a0a7819b75f42c698646b65a3f73" alt=""

---

<style type="text/css">
.code40 {
  font-size: 40%;
}
</style>

.pull-left[
.code40[
```r
# User-facing step: queues a stemming step on `recipe` via add_step().
step_stem <- function(recipe,
                      ...,
                      role = NA,
                      trained = FALSE,
                      columns = NULL,
                      options = list(),
                      custom_stemmer = NULL,
                      skip = FALSE,
                      id = rand_id("stem")) {
  add_step(
    recipe,
    step_stem_new(
      terms = ellipse_check(...),
      role = role,
      trained = trained,
      options = options,
      custom_stemmer = custom_stemmer,
      columns = columns,
      skip = skip,
      id = id
    )
  )
}

# Constructor: bundles the fields into a step object of subclass "stem".
step_stem_new <- function(terms, role, trained, columns, options,
                          custom_stemmer, skip, id) {
  step(
    subclass = "stem",
    terms = terms,
    role = role,
    trained = trained,
    columns = columns,
    options = options,
    custom_stemmer = custom_stemmer,
    skip = skip,
    id = id
  )
}

# prep(): resolves the selected columns, runs check_list() on them, and
# returns a copy of the step flagged as trained.
prep.step_stem <- function(x, training, info = NULL, ...) {
  col_names <- terms_select(x$terms, info = info)

  check_list(training[, col_names])

  step_stem_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    columns = col_names,
    options = x$options,
    custom_stemmer = x$custom_stemmer,
    skip = x$skip,
    id = x$id
  )
}
```
]
]

.pull-right[
.code40[
```r
# bake(): stems every selected column of `new_data`, using `custom_stemmer`
# when supplied and SnowballC::wordStem otherwise.
bake.step_stem <- function(object, new_data, ...) {
  col_names <- object$columns

  stem_fun <- object$custom_stemmer %||%
    SnowballC::wordStem

  for (i in seq_along(col_names)) {
    stemmed_text <- map(new_data[, col_names[i], drop = TRUE],
                        stem_fun)

    new_data[, col_names[i]] <- tibble(stemmed_text)
  }
  new_data <- factor_to_text(new_data, col_names)
  as_tibble(new_data)
}

# print(): writes "Stemming for " followed by printer()'s output and
# returns `x` invisibly.
print.step_stem <- function(x, width = max(20, options()$width - 30), ...) {
  cat("Stemming for ", sep = "")
  printer(x$columns, x$terms, x$trained, width = width)
  invisible(x)
}

# tidy(): tibble describing the step, with the step id attached.
tidy.step_stem <- function(x, ...) {
  if (is_trained(x)) {
    res <- tibble(terms = x$terms,
                  # TRUE only when the user supplied their own stemmer
                  is_custom_stemmer = !is.null(x$custom_stemmer))
  } else {
    term_names <- sel2char(x$terms)
    res <- tibble(terms = term_names,
                  value = na_chr)
  }
  res$id <- x$id
  res
}
```
]
]

---

<style type="text/css">
.code40 {
  font-size: 40%;
}
</style>

.pull-left[
.code40[
```r
# User-facing step: queues a stemming step on `recipe` via add_step().
step_stem <- function(recipe,
                      ...,
                      role = NA,
                      trained = FALSE,
                      columns = NULL,
                      options = list(),
                      custom_stemmer = NULL,
                      skip = FALSE,
                      id = rand_id("stem")) {
  add_step(
    recipe,
    step_stem_new(
      terms = ellipse_check(...),
      role = role,
      trained = trained,
      options = options,
      custom_stemmer = custom_stemmer,
      columns = columns,
      skip = skip,
      id = id
    )
  )
}

# Constructor: bundles the fields into a step object of subclass "stem".
step_stem_new <- function(terms, role, trained, columns, options,
                          custom_stemmer, skip, id) {
  step(
    subclass = "stem",
    terms = terms,
    role = role,
    trained = trained,
    columns = columns,
    options = options,
    custom_stemmer = custom_stemmer,
    skip = skip,
    id = id
  )
}

# prep(): resolves the selected columns, runs check_list() on them, and
# returns a copy of the step flagged as trained.
prep.step_stem <- function(x, training, info = NULL, ...) {
  col_names <- terms_select(x$terms, info = info)

  check_list(training[, col_names])

  step_stem_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    columns = col_names,
    options = x$options,
    custom_stemmer = x$custom_stemmer,
    skip = x$skip,
    id = x$id
  )
}
```
]
]

.pull-right[
.code40[
```r
# bake(): stems every selected column of `new_data`, using `custom_stemmer`
# when supplied and SnowballC::wordStem otherwise.
bake.step_stem <- function(object, new_data, ...) {
* col_names <- object$columns
*
* stem_fun <- object$custom_stemmer %||%
*   SnowballC::wordStem
*
* for (i in seq_along(col_names)) {
*   stemmed_text <- map(new_data[, col_names[i], drop = TRUE],
*                       stem_fun)
*
*   new_data[, col_names[i]] <- tibble(stemmed_text)
* }
* new_data <- factor_to_text(new_data, col_names)
* as_tibble(new_data)
}

# print(): writes "Stemming for " followed by printer()'s output and
# returns `x` invisibly.
print.step_stem <- function(x, width = max(20, options()$width - 30), ...) {
  cat("Stemming for ", sep = "")
  printer(x$columns, x$terms, x$trained, width = width)
  invisible(x)
}

# tidy(): tibble describing the step, with the step id attached.
tidy.step_stem <- function(x, ...) {
  if (is_trained(x)) {
    res <- tibble(terms = x$terms,
                  # TRUE only when the user supplied their own stemmer
                  is_custom_stemmer = !is.null(x$custom_stemmer))
  } else {
    term_names <- sel2char(x$terms)
    res <- tibble(terms = term_names,
                  value = na_chr)
  }
  res$id <- x$id
  res
}
```
]
]

---

# The bake step

```r
# bake(): stems every selected column of `new_data`, using `custom_stemmer`
# when supplied and SnowballC::wordStem otherwise.
bake.step_stem <- function(object, new_data, ...) {
  col_names <- object$columns

  stem_fun <- object$custom_stemmer %||%
    SnowballC::wordStem

  for (i in seq_along(col_names)) {
    stemmed_text <- map(new_data[, col_names[i], drop = TRUE],
                        stem_fun)

    new_data[, col_names[i]] <- tibble(stemmed_text)
  }
  new_data <- factor_to_text(new_data, col_names)
  as_tibble(new_data)
}
```

---

# The traceback

.pull-left[
data:image/s3,"s3://crabby-images/d6aa3/d6aa3f53f30bef1b69c41cf867b5b5e53b3588c9" alt=":scale 100%"
]

.pull-right[
- I call `prep()`
- `prep()` calls `prep.recipe()`
- `prep.recipe()` calls `bake()` in a loop
- `bake()` calls `bake.step_stem()` quite a few levels deep.
]

---

# Browser to the rescue

data:image/s3,"s3://crabby-images/ebd47/ebd47388dc637d39b6ddbeddfafa036a66a93ebf" alt=":scale 80%"

---

# Plenty of follow-up

data:image/s3,"s3://crabby-images/13886/13886b22f3381242a75eee7f69cc3938e92c89c1" alt=""

---

# Don't reinvent the wheel

textrecipes stands on the shoulders of

- recipes (obviously)
- tokenizers
- SnowballC
- stopwords
- text2vec
- textfeatures

---

# My biggest git mistake

data:image/s3,"s3://crabby-images/87ffe/87ffe85e835a8a0505cc278b4c74caa705e16113" alt=""

---

# The reward - joined my first organization

data:image/s3,"s3://crabby-images/3c401/3c4016da723613d1617e872d3685d43c3bb9f12e" alt=""

---

# The reward - CRAN releases

data:image/s3,"s3://crabby-images/f082a/f082a932cedef859bbee989fd9b5c40a4cc1c0bc" alt=":scale 70%"

---

# The reward - Post on tidyverse.org

data:image/s3,"s3://crabby-images/c42d4/c42d447e231c9b3ccdb0460cce45880f9f94fe56" alt=""

---

class: center, middle

# Thank you!

###
[EmilHvitfeldt](https://github.com/EmilHvitfeldt/) ###
[@Emil_Hvitfeldt](https://twitter.com/Emil_Hvitfeldt) ###
[emilhvitfeldt](https://www.linkedin.com/in/emilhvitfeldt/) ###
[www.hvitfeldt.me](https://www.hvitfeldt.me) Slides created via the R package [xaringan](https://github.com/yihui/xaringan).