## Replication materials for all results from the manuscript: ## stringi: Fast and Portable Character String Processing in R ## ## by Marek Gagolewski, 2022-02-28 ## https://stringi.gagolewski.com ## ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
## *****************************************************************************
## *********************************** Section 1 *******************************
## *****************************************************************************

## Attach stringi and print the short summary returned by stri_info().
library("stringi")
cat(stri_info(short = TRUE))

## *****************************************************************************
## *********************************** Section 2 *******************************
## *****************************************************************************

## Packages used below for reading and querying the HTML document.
library("xml2")
library("rvest")

## Parse the locally stored HTML file.
f <- read_html("20200917_wikipedia_melbourne.html")

## Select all "table" nodes and fetch the text of each of them.
## vapply() (rather than sapply()) pins the result type to a character vector
## with exactly one string per table, also when no table is selected.
all_tables <- html_nodes(f, "table")
text_tables <- vapply(all_tables, html_text, character(1))
str(
  text_tables,
  nchar.max = 65, vec.len = 5, strict.width = "wrap"
) # preview

## Index of the table whose text contains "climate data" (case-insensitive);
## max_count = 1 is passed so that only one detection is requested.
library("stringi")
(idx <- which(
  stri_detect_fixed(
    text_tables, "climate data",
    case_insensitive = TRUE, max_count = 1
  )
))

## Convert the selected table node to a data frame and display it.
(x <- as.data.frame(html_table(all_tables[[idx]], fill = TRUE)))

## Continue with a matrix so that all cells can be processed at once.
x <- as.matrix(x)

## Transliterate every cell; `x[, ] <-` keeps the dim attribute of x.
x[, ] <- stri_trans_general(x, "Publishing-Any; Any-ASCII")

## Use the 1st column/row as labels, then drop them from the data part.
dimnames(x) <- list(x[, 1], x[1, ]) # row, column names
x <- x[2:(nrow(x) - 1), 2:ncol(x)] # skip 1st/last row and 1st column
x[, c(1, ncol(x))] # example columns
## Delete every comma that is placed between two digits.
x[, ] <- stri_replace_all_regex(x, "(?<=\\d),(?=\\d)", "")

## A number directly followed by a parenthesised number: keep the former only.
x[, ] <- stri_replace_all_regex(
  x,
  "(\\d+(?:\\.\\d+)?)\\(\\d+(?:\\.\\d+)?\\)",
  "$1"
)
## Remove the two unit suffixes from the row names, one pattern after another.
dimnames(x)[[1]] <- stri_replace_all_fixed(
  dimnames(x)[[1]],
  c(" (°F)", " (inches)"),
  c("", ""),
  vectorise_all = FALSE
)

## Convert the cells to numeric while retaining the shape and the labels.
x <- structure(as.numeric(x), dim = dim(x), dimnames = dimnames(x))
x[, c(1, 6, ncol(x))] # example columns

## Difference between two rows, with the last column left out.
x["Record high °C", -ncol(x)] - x["Record low °C", -ncol(x)]

## Row total (last column left out) divided by 365.25.
sum(x["Average rainfall mm", -ncol(x)]) / 365.25

## *****************************************************************************
## *********************************** Section 3 *******************************
## *****************************************************************************

## A string literal.
"spam" # or 'spam'

## Its type and the length of the vector that stores it.
typeof("spam") # object type; see also is.character() and is.vector()
length("spam") # how many strings are in this vector?
## A character vector with six strings.
pythons <- c(
  "Graham Chapman", "John Cleese", "Terry Gilliam",
  "Eric Idle", "Terry Jones", "Michael Palin"
)

## Split each string at the space; the result overwrites `pythons`.
(pythons <- stri_split_fixed(pythons, " ", simplify = TRUE))

## Recycling rule in base R arithmetic.
c(10, -1) * c(1, 2, 3, 4) # == c(10, -1, 10, -1) * c(1, 2, 3, 4)

## Many strings, one pattern.
stri_count_fixed(c("abcd", "abcabc", "abdc", "dab", NA), "abc")

## One string, many patterns.
stri_count_fixed("abcdeabc", c("def", "bc", "abc", NA))

## Strings and patterns of the same length.
stri_count_fixed(c("abca", "def", "ghi"), c("a", "z", "h"))

## Matrix input: every 3-letter combination of "a", "b", "c".
(haystack <- matrix( # example input
  do.call(
    stri_join,
    expand.grid(c("a", "b", "c"), c("a", "b", "c"), c("a", "b", "c"))
  ),
  nrow = 3
))
needle <- c("a", "b", "c")
matrix(
  stri_count_fixed(haystack, needle), # call to stringi
  nrow = 3,
  dimnames = list(needle, NULL)
)

## Each string against each pattern, via outer().
haystack <- c("aaa", "bbb", "ccc", "abc", "cba", "aab", "bab", "acc")
needle <- c("a", "b", "c")
structure(
  outer(haystack, needle, stri_count_fixed),
  dimnames = list(haystack, needle) # add row and column names
)

## The same table obtained with rep() and a row-wise filled matrix.
matrix(
  stri_count_fixed(rep(haystack, each = length(needle)), needle),
  byrow = TRUE,
  ncol = length(needle),
  dimnames = list(haystack, needle)
)

## Joining vectors that include a missing value.
paste(c(NA_character_, "b", "c"), "x", 1:2) # base R
stri_join(c(NA_character_, "b", "c"), "x", 1:2) # stringi
## First run of 1 to 4 word characters enclosed by word boundaries.
haystack <- c("bacon", "spam", "jam, spam, bacon, and spam")
stri_extract_first_regex(haystack, "\\b\\w{1,4}\\b")

## All such runs.
stri_extract_all_regex(
  haystack, "\\b\\w{1,4}\\b",
  omit_no_match = TRUE
)

## Locate the matches first, then cut the substrings out.
stri_sub_all(
  haystack,
  stri_locate_all_regex(
    haystack, "\\b\\w{1,4}\\b",
    omit_no_match = TRUE
  )
)

## Letter matching with base R's gregexpr(), three variants.
x <- "AEZaezĄĘŻąęż"
stri_sub(x, gregexpr("[[:alpha:]]", x, perl = FALSE)[[1]], length = 1)
stri_sub(x, gregexpr("[[:alpha:]]", x, perl = TRUE)[[1]], length = 1)
stri_sub(x, gregexpr("\\p{L}", x, perl = TRUE)[[1]], length = 1)

## Timings: stri_join() versus paste() on random strings.
x <- stri_rand_strings(length(LETTERS) * 1000, 1000)
microbenchmark::microbenchmark(
  join2 = stri_join(LETTERS, x, sep = "", collapse = ", "),
  join3 = stri_join(x, LETTERS, x, sep = "", collapse = ", "),
  r_paste2 = paste(LETTERS, x, sep = "", collapse = ", "),
  r_paste3 = paste(x, LETTERS, x, sep = "", collapse = ", ")
)

## Timings: locating a pattern with stringi and with gregexpr().
set.seed(123)
x <- stri_rand_strings(100, 100000, "[actg]")
y <- "acca"
microbenchmark::microbenchmark(
  fixed = stri_locate_all_fixed(x, y),
  regex = stri_locate_all_regex(x, y),
  coll = stri_locate_all_coll(x, y),
  r_tre = gregexpr(y, x),
  r_pcre = gregexpr(y, x, perl = TRUE),
  r_fixed = gregexpr(y, x, fixed = TRUE)
)

## *****************************************************************************
## *********************************** Section 4 *******************************
## *****************************************************************************

## Example vector that includes a missing value and an empty string.
x <- c(
  "spam", "你好", "\u200b\u200b\u200b",
  NA_character_, ""
)

## Number of elements in the vector.
length(x)

## stri_length() of each element.
stri_length(x)

## stri_width() of each element.
stri_width(x)

## Concatenation with the %s+% operator.
x <- c("tasty", "delicious", "yummy", NA)
x %s+% " " %s+% c("spam", "bacon")

## Drop missing values, then merge everything into a single string.
stri_flatten(stri_omit_na(x), collapse = ", ")

## Element-wise joining of three vectors.
stri_join(c("X", "Y", "Z"), 1:6, "a") # sep = "", collapse = NULL

## The same with a separator and a collapse string.
stri_join(c("X", "Y", "Z"), 1:6, "a", sep = "_", collapse = ", ")

## 2nd column, separator, 1st column of `pythons`; then an outer product.
stri_join(pythons[, 2], pythons[, 1], sep = ", ")
outer(LETTERS[1:3], 1:5, stri_join, sep = ".") # outer product

## Duplicating strings.
stri_dup(letters[1:5], 1:5) # synonym: letters[1:5] %s*% 1:5

## Joining the strings stored in each element of a list.
words <- list(
  c("spam", "bacon", "sausage", "spam"),
  c("eggs", "spam")
)
stri_join_list(words, sep = ", ") # collapse = NULL

## The same, with the per-element results collapsed afterwards.
stri_join_list(words, sep = ", ", collapse = ";\n")

## Extracting elements of a character vector with `[`.
x <- c("spam", "buckwheat", "", NA, "bacon")
x[1:3] # from 1st to 3rd string
x[c(1, length(x))] # 1st and last

## Negative index.
x[-1] # all but 1st

## Logical index: elements that are neither empty nor missing.
x[!stri_isempty(x) & !is.na(x)]
## Substrings selected by code point positions.
y <- "spam, egg, spam, spam, bacon, and spam"
stri_sub(y, 18) # from 18th code point to end
stri_sub(y, 12, to = 15) # from 12th to 15th code point (inclusive)

## Negative start index combined with a length.
stri_sub(y, -15, length = 5) # 5 code points from 15th last

## Several substrings per input string: `from`/`length` are given as lists.
(z <- stri_sub_all(
  c("spam", "bacon", "sorghum"),
  from = list(c(1, 3, 4), -3, c(2, 4)),
  length = list(1, 3, c(4, 3))
))

## Join the pieces stored in each list element.
stri_join_list(z, sep = ", ")

## Or arrange them in a matrix, filling the gaps with "".
stri_list2matrix(z, by_row = TRUE, fill = "", n_min = 5)

## A two-column from/to matrix passed as the index.
(from_to <- cbind(
  from = c(1, 12, 18),
  to = c(4, 15, 21)
)) # +optional labels
stri_sub(y, from_to)

## The same kind of matrix with two input strings: stri_sub() ...
(from_to <- matrix(1:8, ncol = 2, byrow = TRUE))
stri_sub(c("abcdefgh", "ijklmnop"), from_to)

## ... and stri_sub_all().
stri_sub_all(c("abcdefgh", "ijklmnop"), from_to)

## Picking code points in a given, random, and reversed order.
set.seed(123)
stri_join_list(stri_sub_all("spam", c(4, 3, 2, 3, 1), length = 1))
stri_rand_shuffle("bacon") # random order
stri_reverse("spam") # reverse order

## Replacing one substring per input string.
stri_sub_replace(
  c("abcde", "ABCDE"),
  from = c(2, 4),
  length = c(1, 2),
  replacement = c("X", "uvw")
)

## Replacing several substrings per input string.
stri_sub_replace_all(
  c("abcde", "ABCDE"),
  from = list(c(2, 4), c(0, 3, 6)),
  length = list(1, c(0, 2, 0)),
  replacement = list("Z", c("uu", "v", "wwww"))
)
## Replacement form of stri_sub().
y <- "spam, egg, spam, spam, bacon, and spam"
stri_sub(y, 7, length = 3) <- "spam" # in-place replacement, egg → spam
print(y) # y has changed

## Replacement form of stri_sub_all().
y <- "aa bb cc"
stri_sub_all(y, c(1, 4, 7), length = 2) <- c("A", "BB", "CCC")
print(y) # y has changed

## *****************************************************************************
## *********************************** Section 5 *******************************
## *****************************************************************************

## The %s===% comparison operator, one string against several.
"actg" %s===% c("ACTG", "actg", "act", "actga", NA)

## Counting occurrences of a fixed pattern.
stri_count_fixed("abcabcdefabcabcabdc", "abc") # search pattern is "abc"

## The same through the stri_count() wrapper.
stri_count("abcabcdefabcabcabdc", fixed = "abc")

## The same with the native pipe.
c("abcabcdefabcabcabdc", "cba", NA) |> stri_count_fixed("abc")

## Case-insensitive counting.
stri_count_fixed("ACTGACGacgggACg", "acg", case_insensitive = TRUE)

## Counting without and with overlapping matches.
stri_count_fixed("acatgacaca", "aca") # overlap = FALSE (default)
stri_count_fixed("acatgacaca", "aca", overlap = TRUE)

## Detecting a pattern.
x <- c("abc", "abcd", "def", "xyzabc", "uabdc", "dab", NA, "abc")
stri_detect_fixed(x, "abc")

## Negated detection with max_count = 2.
stri_detect_fixed(x, "abc", negate = TRUE, max_count = 2)

## Matching at the start or at the end.
stri_startswith_fixed(x, "abc") # from = 1 - match at start
stri_endswith_fixed(x, "abc") # to = -1 - match at end

## Subsetting by pattern, with missing values omitted.
stri_subset_fixed(x, "abc", omit_na = TRUE)

## Replacement form of the subsetting function.
stri_subset_fixed(x, "abc") <- c("*****", "***") # modifies x in-place
print(x) # x has changed

## Locating the first and the last match.
x <- c("aga", "actg", NA, "AGagaGAgaga")
stri_locate_first_fixed(x, "aga")
stri_locate_last_fixed(x, "aga", get_length = TRUE)

## Locating all matches: overlapping, case-insensitive.
stri_locate_all_fixed(
  x, "aga",
  overlap = TRUE, case_insensitive = TRUE
)

## Extracting the first match and all matches.
stri_extract_first_fixed(x, "aga", case_insensitive = TRUE)
stri_extract_all_fixed(
  x, "aga",
  overlap = TRUE, case_insensitive = TRUE, omit_no_match = TRUE
)

## Replacing all matches.
x <- c("aga", "actg", NA, "ggAGAGAgaGAca", "agagagaga")
stri_replace_all_fixed(x, "aga", "~", case_insensitive = TRUE)

## Several pattern/replacement pairs: default mode and vectorise_all = FALSE.
stri_replace_all_fixed(
  "The quick brown fox jumped over the lazy dog.",
  c("quick", "brown", "fox", "lazy", "dog"),
  c("slow", "yellow-ish", "hen", "spamity", "llama")
)
stri_replace_all_fixed(
  "The quick brown fox jumped over the lazy dog.",
  c("quick", "brown", "fox", "lazy", "dog"),
  c("slow", "yellow-ish", "hen", "spamity", "llama"),
  vectorise_all = FALSE
)

## Splitting at commas, with empty fields omitted.
x <- c("a,b,c,d", "e", "", NA, "f,g,,,h,i,,j,")
stri_split_fixed(x, ",", omit_empty = TRUE)

## *****************************************************************************
## *********************************** Section 6 *******************************
## *****************************************************************************

## A pattern made of literal characters only.
stri_count_regex("spam, eggs, spam, bacon, sausage, and spam", "spam")

## Case-insensitive regex matching.
stri_detect_regex("groß", "GROSS", case_insensitive = TRUE)

## Matching a literal dot requires escaping it.
stri_count_regex("spam...", "\\.") # "\\" is a way to input a single \

## The dot metacharacter: default behaviour ...
x <- "Ham, spam,\njam, SPAM, eggs, and spam"
stri_extract_all_regex(x, "..am", case_insensitive = TRUE)

## ... and with dot_all = TRUE.
stri_extract_all_regex(
  x, "..am",
  dot_all = TRUE, case_insensitive = TRUE
)

## A character set.
stri_extract_all_regex(x, "[hj]am")

## A negated character set, three times in a row.
x <- "Nobody expects the Spanish Inquisition!"
stri_extract_all_regex(x, "[^ ][^ ][^ ]")

## Character ranges.
stri_extract_all_regex("In 2020, Gągolewski had fun once.", "[0-9A-Za-z]")

## Unicode general categories; results are labelled with the patterns.
x <- "aąbßÆAĄB你123,.;'! \t-+=[]©←→”„²³¾"
p <- c("\\p{L}", "\\p{Ll}", "\\p{Lu}", "\\p{N}", "\\p{P}", "\\p{S}")
structure(stri_extract_all_regex(x, p), names = p)

## Shorthand character classes.
p <- c("\\w", "\\d", "\\s")
structure(stri_extract_all_regex(x, p), names = p)

## [[:punct:]] in base R (perl = TRUE) ...
x <- ",./|\\<>?;:'\"[]{}-=_+()*&^%$€#@!`~×‒„”"
regmatches(x, gregexpr("[[:punct:]]", x, perl = TRUE)) # base R

## ... and in stringi.
stri_extract_all_regex(x, "[[:punct:]]") # equivalently: \p{P}
stri_extract_all_regex(x, "\\p{S}") # symbols

## Alternation.
x <- "spam, egg, ham, jam, algae, and an amalgam of spam, all al dente"
stri_extract_all_regex(x, "spam|ham")

## Grouping, with (?# ...) comments embedded in the pattern.
stri_extract_all_regex(
  x,
  "(?# match 'sp' or 'h')(sp|h)(?# and 'am')am|(?# or match 'egg')egg"
)

## The same pattern assembled from commented R strings.
stri_extract_all_regex(
  x,
  stri_join(
    "(sp|h)", # match either 'sp' or 'h'
    "am", # followed by 'am'
    "|", # ... or ...
    "egg" # just match 'egg'
  )
)

## The (?i) flag set inside the pattern.
stri_count_regex("Spam spam SPAMITY spAm", "(?i)spam")

## Greedy versus lazy quantifiers.
x <- "sp(AM)(maps)(SP)am"
stri_extract_all_regex(
  x,
  c(
    "\\(.+\\)", # [[1]] greedy
    "\\(.+?\\)", # [[2]] lazy
    "\\([^)]+\\)" # [[3]] greedy (but clever)
  )
)

## Quantified groups.
stri_extract_first_regex(
  "spamamamnomnomnomammmmmmmmm",
  c("sp(am|nom)+", "sp(am|nom)+?", "sp(am|nom)+?m*", "sp(am|nom)+?m+")
)

## Optional group.
stri_extract_all_regex(
  "12, 34.5, 678.901234, 37...629, ...",
  c("\\d+\\.\\d+", "\\d+(\\.\\d+)?")
)

## Nested quantifiers with a time limit; an error, if raised, is reported
## by printing "stopped.".
system.time(tryCatch(
  {
    stri_detect_regex("a" %s*% 1000 %s+% "c", "(a+)+b", time_limit = 1e5)
  },
  error = function(e) cat("stopped.")
))

## Capture groups.
x <- "name='Sir Launcelot', quest='Seek the Grail', favcolor='blue'"
stri_match_all_regex(x, "(\\w+)='(.+?)'")

## Non-capturing group followed by a named capture group.
## The group name had been lost ("(?.+?)" is not a valid group); it is
## restored as `value`, the name referred to by the ${value} replacement below.
stri_match_all_regex(x, "(?:\\w+)='(?<value>.+?)'")

## Locating the named groups `key` and `value`.
stri_locate_all_regex(
  x, "(?<key>\\w+)='(?<value>.+?)'",
  capture_groups = TRUE, get_length = TRUE
)

## Replacement that refers to groups by number.
stri_replace_all_regex(x, "(\\w+)='(.+?)'", "$2 is a $1")

## Replacement that refers to groups by name; the pattern must therefore
## define groups named `key` and `value`.
stri_replace_all_regex(
  x, "(?<key>\\w+)='(?<value>.+?)'",
  "${value} is a ${key}"
)

## Back-reference to a capture group: an opening tag matched with any
## closing tag versus with the closing tag of the same name.
## NOTE(review): the angle-bracketed parts of this example (tags in the input,
## the closing-tag sub-patterns and the \\1 back-reference) had been stripped;
## they are reconstructed here -- confirm against the manuscript.
stri_extract_all_regex(
  "<strong><em>spam</em></strong><code>eggs</code>",
  c("<[a-z]+>.*?</[a-z]+>", "<([a-z]+)>.*?</\\1>")
)
## Anchors: each string tested against each pattern.
x <- c("spam egg", "bacon spam", "spam", "egg spam bacon", "sausage")
p <- c("spam", "^spam", "spam$", "spam$|^spam", "^spam$")
structure(
  outer(x, p, stri_detect_regex),
  dimnames = list(x, p)
)

## Word boundaries.
stri_extract_all_regex(
  "12, 34.5, J23, 37.629cm",
  "\\b\\d+(\\.\\d+)?+\\b"
)

## Positive and negative look-aheads.
stri_extract_all_regex(
  "I like spam, spam, eggs, and spam.",
  c("\\w+(?=[,.])", "\\w++(?![,.])")
)

## *****************************************************************************
## *********************************** Section 7 *******************************
## *****************************************************************************

## Number of entries returned by stri_locale_list().
length(stri_locale_list())

## Five of them, picked at random.
set.seed(514678)
sample(stri_locale_list(), 5)

## Not run:
## install.packages("stringi", configure.args = "--disable-pkg-config")

## The locale currently in use.
stri_locale_get()

## The %s==% operator.
"a\u0328" %s==% "ą" # a, ogonek == a with ogonek

## Unique and duplicated strings.
x <- c("Gągolewski", "Gagolewski", "Ga\u0328golewski")
stri_unique(x)
stri_duplicated(x) # from_last = FALSE

## The %s<% operator.
"chaotic" %s<% "hard" # c < h

## Comparison with an explicitly selected locale.
stri_cmp_lt("chłodny", "hardy", locale = "pl_PL") # c < h
## The same kind of comparison under another locale.
stri_cmp_lt("chladný", "hladný", locale = "sk_SK") # ch > h

## Locale-specific sorting.
stri_sort(
  c("chłodny", "hardy", "cichy", "cenny"),
  locale = "pl_PL"
)
stri_sort(
  c("cudný", "chladný", "hladný", "čudný"),
  locale = "sk_SK"
)

## One locale, with and without the collation=phonebook keyword.
german_k_words <- c("können", "kondensieren", "kochen", "korrelieren")
stri_sort(german_k_words, locale = "de_DE")
stri_sort(german_k_words, locale = "de_DE@collation=phonebook")

## Equivalence test with strength = 2.
stri_cmp_equiv("\ufb00", "ff", strength = 2)

## Unique strings under different collation strengths.
x <- c("gro\u00df", "gross", "GROSS", "Gro\u00df", "Gross", "GRO\u1e9e")
stri_unique(x, strength = 1) # ß == ss, case insensitive
stri_unique(x, strength = 2) # ß != ss, case insensitive

## strength = 1 combined with case_level = TRUE.
stri_unique(x, strength = 1, case_level = TRUE) # ß == ss, case sensitive

## The alternate_shifted option.
x <- c("code point", "code-point", "codepoint", "CODE POINT", "CodePoint")
stri_unique(x, alternate_shifted = TRUE)
stri_unique(x, alternate_shifted = TRUE, strength = 2)

## For comparison: strength = 2 without alternate_shifted.
stri_unique(x, strength = 2)

## Sorting the same words under two locales.
stri_sort(c("cote", "côte", "coté", "côté"), locale = "fr_FR")
stri_sort(
  c("cote", "côte", "coté", "côté"),
  locale = "fr_CA"
) # french = TRUE

## Default sorting of strings that end with digits ...
x <- c("a1", "a2", "a11", "a1", "a99", "a10", "a100", "a2", "a9", "a2")
stri_sort(x)

## ... and with numeric = TRUE.
stri_sort(x, numeric = TRUE)
## Order rows by decreasing rank of `a` (numeric = TRUE), then by `b`.
set.seed(123)
X <- data.frame(a = x, b = runif(length(x)))
X[order(-stri_rank(X$a, numeric = TRUE), X$b), ]

## Collator-based pattern detection with strength = 1.
stri_detect_coll(
  "Er ist so groß.", "GROSS",
  strength = 1, locale = "de_AT"
)
stri_detect_coll(
  "On je chladný", "chladny",
  strength = 1, locale = "sk_SK"
)

## *****************************************************************************
## *********************************** Section 8 *******************************
## *****************************************************************************

## Example text built from two strings.
x <- "The\u00a0above-mentioned features are useful. " %s+%
  "My hovercraft is full of eels, eggs, and spam."

## Counting boundaries of type "sentence".
stri_count_boundaries(x, type = "sentence")

## Extracting words.
stri_extract_all_words(x)

## Padding on both sides up to width 77.
cat(stri_pad("SPAMITY SPAM", width = 77, side = "both", pad = "."))

## Trimming.
x <- " spam, eggs, and lovely spam.\n"
stri_trim(x) # side = "both"

## Wrapping a random paragraph, with indentation and a line prefix.
set.seed(1233)
x <- stri_rand_lipsum(1) # random text paragraph
cat(
  stri_wrap(x, width = 74, indent = 8, exdent = 4, prefix = "> "),
  sep = "\n"
)

## Formatting strings in a field of width 6.
cat(
  stri_sprintf("[%6s]", c("abcd", "\u200b\u200b\u200bąß²€")),
  sep = "\n"
)

## The %s$% operator.
"value='%.4f'" %s$% pi # equivalently: "value='%.4f'" %s$% list(pi)
"%s='%.*3$f'" %s$% list("pi", pi, 1:4)
## Five random strings of lengths 2 to 6 drawn from the set [ACTG].
set.seed(123)
stri_rand_strings(5, 2:6, "[ACTG]")

## One random string of length 8 drawn from an intersection of two sets.
stri_rand_strings(1, 8, "[\\p{script=Katakana}&\\p{L}]")

## Case mapping, partly with an explicit locale.
stri_trans_toupper("groß")
stri_trans_tolower("Iİ", locale = "tr_TR") # Turkish
stri_trans_totitle("ijsvrij yoghurt", locale = "nl_NL") # Dutch

## Character-by-character translation.
stri_trans_char("GATAAATCTGGTCTTATTTCC", "ACGT", "tgca")

## Entries returned by stri_trans_list().
set.seed(12345)
sample(stri_trans_list(), 9) # a few random entries

## Two chained transforms.
stri_trans_general("groß© żółć La Niña köszönöm", "upper; latin-ascii")

## A date-time given as text.
x <- "1 maja 2021 r., godz. 17:17:32"

## Parsing it with an explicitly written format string.
stri_datetime_parse(
  x, "dd MMMM yyyy 'r., godz.' HH:mm:ss",
  locale = "pl_PL", tz = "Europe/Warsaw"
)

## Parsing it with a format string produced by stri_datetime_fstr().
stri_datetime_parse(
  x,
  stri_datetime_fstr("%d %B %Y r., godz. %H:%M:%S"),
  locale = "pl_PL", tz = "Europe/Warsaw"
)

## Formatting a date-time relative to the current one.
stri_datetime_format(
  stri_datetime_add(stri_datetime_now(), 1, "day"), # add 1 day to 'now'
  "datetime_relative_long", # full format, relative to 'now'
  locale = "en_NZ",
  tz = "NZ"
)

## Formatting dates with a calendar selected through the locale keyword.
stri_datetime_format(
  stri_datetime_create(2020, 1:12, 1),
  "date_long",
  locale = "@calendar=hebrew"
)
stri_datetime_format(
  stri_datetime_create(2020, c(2, 8), c(4, 7)),
  "date_full",
  locale = "ja_JP@calendar=japanese"
)

## *****************************************************************************
## *********************************** Section 9 *******************************
## *****************************************************************************

## Per-pattern match counts over the code points 1..0x10ffff with the range
## 0xd800..0xf8ff excluded.
z <- c(
  "\\p{L}", "\\p{Ll}", "\\p{Lu}", "\\p{N}", "\\p{P}", "\\p{S}",
  "\\w", "\\d", "\\s"
)
structure(
  stri_count_regex(
    stri_enc_fromutf32(setdiff(1:0x10ffff, c(0xd800:0xf8ff))),
    z
  ),
  names = z
)

## Escape sequence for a code point.
"\u007A" # or "\U0000007A"

## Escaping a string.
x <- "zß你好"
stri_escape_unicode(x)

## Escape sequence with more hex digits.
"\U001F600" # the grinning face emoji, (: - font unavailable

## The "any-name" transform.
stri_trans_general("\U001F600", "any-name") # query the character database

## Example string used in the encoding examples.
x <- "abz0ąß你好!"
## Raw bytes of x in two encodings.
stri_encode(x, to = "UTF-8", to_raw = TRUE)[[1]]
stri_encode(x, to = "UTF-16LE", to_raw = TRUE)[[1]] # little-endian

## Reading a text file while declaring its encoding.
x <- stri_read_lines("ES_latin1.txt", encoding = "ISO-8859-1")
head(x, 4) # x is in UTF-8 now

## Reading the same file as raw bytes.
x <- stri_read_raw("ES_latin1.txt")
head(x, 24) # vector of type raw

## Encoding checks on the raw data.
stri_enc_isascii(x)
stri_enc_isutf8(x) # false positives are possible

## Encoding detection.
stri_enc_detect(x) # based on heuristics

## Converting the raw data from ISO-8859-1 to UTF-8.
y <- stri_encode(x, from = "ISO-8859-1", to = "UTF-8")

## Last four lines of the converted text.
tail(stri_split_lines1(y), 4) # spoiler alert!

## Code points of a string, of its NFC form, and of its NFD form.
x <- "a\u0328 ą" # a, combining ogonek, space, a with ogonek
stri_enc_toutf32( # code points as decimals
  c(x, stri_trans_nfc(x), stri_trans_nfd(x))
)

## The NFKD transform.
stri_trans_nfkd("r²︷")