## Replication materials for all results from the manuscript:
## stringi: Fast and Portable Character String Processing in R
##
## by Marek Gagolewski, 2022-02-28
## https://stringi.gagolewski.com
##
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


## *****************************************************************************
## *********************************** Section 1 *******************************
## *****************************************************************************

## -----------------------------------------------------------------------------
# Attach stringi and print the short-form information string that
# stri_info() returns.
library("stringi")
cat(stri_info(short = TRUE))


## *****************************************************************************
## *********************************** Section 2 *******************************
## *****************************************************************************

## -----------------------------------------------------------------------------
library("xml2")
library("rvest")


## -----------------------------------------------------------------------------
# Parse the locally stored HTML file.
f <- read_html("20200917_wikipedia_melbourne.html")


## -----------------------------------------------------------------------------
# Collect every <table> node and extract the text of each one.
# vapply() pins the result to a character vector with one string per table;
# sapply() would return an empty list if no table were found, changing the
# type handed to the pattern search that follows.
all_tables <- html_nodes(f, "table")
text_tables <- vapply(all_tables, html_text, character(1))
str(text_tables, nchar.max = 65, vec.len = 5, strict.width = "wrap") # preview


## -----------------------------------------------------------------------------
library("stringi")
# Index of the first table whose text contains "climate data" (any letter
# case); the search is asked to stop after one hit.
(idx <- text_tables |>
  stri_detect_fixed("climate data", case_insensitive = TRUE, max_count = 1) |>
  which())


## -----------------------------------------------------------------------------
# Convert the selected table node to a data frame and display it.
(x <- all_tables[[idx]] |>
  html_table(fill = TRUE) |>
  as.data.frame())


## -----------------------------------------------------------------------------
# Continue with a matrix instead of a data frame.
x <- x |> as.matrix()


## -----------------------------------------------------------------------------
# Apply the "Publishing-Any; Any-ASCII" transliteration to every cell.
x[, ] <- x |> stri_trans_general("Publishing-Any; Any-ASCII")


## -----------------------------------------------------------------------------
dimnames(x) <- list(
  x[, 1],  # first column supplies the row names
  x[1, ]   # first row supplies the column names
)
x <- x[2:(nrow(x) - 1), 2:ncol(x)]  # drop first/last row and first column
x[, c(1, ncol(x))]  # show the first and the last column


## -----------------------------------------------------------------------------
# Delete commas that sit directly between two digits.
x[, ] <- x |> stri_replace_all_regex("(?<=\\d),(?=\\d)", "")


## -----------------------------------------------------------------------------
# Keep a number and discard the parenthesised number that directly follows it.
x[, ] <- x |>
  stri_replace_all_regex("(\\d+(?:\\.\\d+)?)\\(\\d+(?:\\.\\d+)?\\)", "$1")
# Strip the substrings " (°F)" and " (inches)" from the row names.
dimnames(x)[[1]] <- dimnames(x)[[1]] |>
  stri_replace_all_fixed(
    c(" (°F)", " (inches)"), c("", ""),
    vectorise_all = FALSE
  )


## -----------------------------------------------------------------------------
# Convert all cells to numeric; dim and dimnames are re-attached explicitly.
x <- structure(
  as.numeric(x),
  dim = dim(x),
  dimnames = dimnames(x)
)
x[, c(1, 6, ncol(x))]  # columns 1, 6 and the last one


## -----------------------------------------------------------------------------
# Record high minus record low, last column left out.
x["Record high °C", -ncol(x)] - x["Record low °C", -ncol(x)]


## -----------------------------------------------------------------------------
# Rainfall row summed without the last column, divided by 365.25.
sum(x["Average rainfall mm", -ncol(x)]) / 365.25


## *****************************************************************************
## *********************************** Section 3 *******************************
## *****************************************************************************

## -----------------------------------------------------------------------------
"spam"  # single quotes are accepted as well: 'spam'


## -----------------------------------------------------------------------------
"spam" |> typeof()  # type of the object (cf. is.character(), is.vector())
"spam" |> length()  # number of strings stored in this vector


## -----------------------------------------------------------------------------
pythons <- c(
  "Graham Chapman", "John Cleese", "Terry Gilliam",
  "Eric Idle", "Terry Jones", "Michael Palin"
)


## -----------------------------------------------------------------------------
# Split every name at the space; the simplified result replaces pythons.
(pythons <- pythons |> stri_split_fixed(" ", simplify = TRUE))


## -----------------------------------------------------------------------------
# The shorter operand is recycled: c(10, -1, 10, -1) * c(1, 2, 3, 4).
c(10, -1) * c(1, 2, 3, 4)


## -----------------------------------------------------------------------------
# Several strings, a single pattern.
c("abcd", "abcabc", "abdc", "dab", NA) |> stri_count_fixed("abc")


## -----------------------------------------------------------------------------
# A single string, several patterns.
"abcdeabc" |> stri_count_fixed(c("def", "bc", "abc", NA))


## -----------------------------------------------------------------------------
# Three strings and three patterns.
c("abca", "def", "ghi") |> stri_count_fixed(c("a", "z", "h"))


## -----------------------------------------------------------------------------
# Example input: a 3-row matrix of the strings obtained by joining each row
# of the expand.grid() of three copies of c("a", "b", "c").
(haystack <- do.call(
  stri_join,
  expand.grid(c("a", "b", "c"), c("a", "b", "c"), c("a", "b", "c"))
) |> matrix(nrow = 3))
needle <- c("a", "b", "c")
# Count with stringi, then arrange the counts in 3 rows labelled by needle.
haystack |>
  stri_count_fixed(needle) |>
  matrix(nrow = 3, dimnames = list(needle, NULL))


## -----------------------------------------------------------------------------
haystack <- c("aaa", "bbb", "ccc", "abc", "cba", "aab", "bab", "acc")
needle <- c("a", "b", "c")
# Every haystack/needle pair via outer(); dimnames give row/column labels.
outer(haystack, needle, stri_count_fixed) |>
  structure(dimnames = list(haystack, needle))


## -----------------------------------------------------------------------------
# Alternative construction: repeat each haystack once per needle, count,
# and fill a labelled matrix row by row.
rep(haystack, each = length(needle)) |>
  stri_count_fixed(needle) |>
  matrix(
    byrow = TRUE, ncol = length(needle),
    dimnames = list(haystack, needle)
  )


## -----------------------------------------------------------------------------
# The same three arguments are given to base R's paste() and to stri_join().
paste(c(NA_character_, "b", "c"), "x", 1:2)  # base R
stri_join(c(NA_character_, "b", "c"), "x", 1:2)  # stringi


## -----------------------------------------------------------------------------
# First match, per string, of 1 to 4 word characters (\w{1,4}) enclosed by
# word boundaries (\b).
haystack <- c("bacon", "spam", "jam, spam, bacon, and spam")
stri_extract_first_regex(haystack, "\\b\\w{1,4}\\b")


## -----------------------------------------------------------------------------
# All matches of the same pattern, with omit_no_match = TRUE.
stri_extract_all_regex(haystack, "\\b\\w{1,4}\\b", omit_no_match = TRUE)


## -----------------------------------------------------------------------------
# Locate the matches first, then extract the substrings at those locations.
stri_sub_all(haystack,
  stri_locate_all_regex(haystack, "\\b\\w{1,4}\\b", omit_no_match = TRUE))


## -----------------------------------------------------------------------------
# Three base-R gregexpr() variants on one input: [[:alpha:]] with
# perl = FALSE, [[:alpha:]] with perl = TRUE, and \p{L} with perl = TRUE.
# The reported match positions are used to extract single characters.
x <- "AEZaezĄĘŻąęż"
stri_sub(x, gregexpr("[[:alpha:]]", x, perl = FALSE)[[1]], length = 1)
stri_sub(x, gregexpr("[[:alpha:]]", x, perl = TRUE)[[1]], length = 1)
stri_sub(x, gregexpr("\\p{L}", x, perl = TRUE)[[1]], length = 1)


## -----------------------------------------------------------------------------
# Benchmark input: length(LETTERS)*1000 random strings of length 1000.
# Seed the generator first so the input can be regenerated exactly, as the
# other randomised chunks of this script already do.
set.seed(123)
x <- stri_rand_strings(length(LETTERS)*1000, 1000)
# Compare stri_join() with base paste() when joining two or three vectors.
# paste(sep = "") is kept instead of paste0() so that both functions are
# called with the same sep/collapse arguments.
microbenchmark::microbenchmark(
  join2 = stri_join(LETTERS, x, sep = "", collapse = ", "),
  join3 = stri_join(x, LETTERS, x, sep = "", collapse = ", "),
  r_paste2 = paste(LETTERS, x, sep = "", collapse = ", "),
  r_paste3 = paste(x, LETTERS, x, sep = "", collapse = ", ")
)


## -----------------------------------------------------------------------------
set.seed(123)
# 100 random strings of length 100000 drawn from the set [actg].
x <- stri_rand_strings(100, 100000, "[actg]")
y <- "acca"
# Time locating all occurrences of y in x: three stringi search variants
# (fixed, regex, coll) against three base-R gregexpr() modes.
microbenchmark::microbenchmark(
  fixed = stri_locate_all_fixed(x, y),
  regex = stri_locate_all_regex(x, y),
  coll = stri_locate_all_coll(x, y),
  r_tre = gregexpr(y, x),
  r_pcre = gregexpr(y, x, perl = TRUE),
  r_fixed = gregexpr(y, x, fixed = TRUE)
)


## *****************************************************************************
## *********************************** Section 4 *******************************
## *****************************************************************************

## -----------------------------------------------------------------------------
x <- c(
  "spam",
  "你好",
  "\u200b\u200b\u200b",
  NA_character_,
  ""
)


## -----------------------------------------------------------------------------
x |> length()  # number of elements in the vector


## -----------------------------------------------------------------------------
x |> stri_length()  # length of each string as reported by stringi


## -----------------------------------------------------------------------------
x |> stri_width()  # width of each string as reported by stringi


## -----------------------------------------------------------------------------
x <- c("tasty", "delicious", "yummy", NA)
# Concatenation with the %s+% operator.
x %s+% " " %s+% c("spam", "bacon")


## -----------------------------------------------------------------------------
# Drop the missing value, then flatten into one comma-separated string.
x |>
  stri_omit_na() |>
  stri_flatten(collapse = ", ")


## -----------------------------------------------------------------------------
# sep and collapse are left at their defaults (sep = "", collapse = NULL).
stri_join(
  c("X", "Y", "Z"),
  1:6,
  "a"
)


## -----------------------------------------------------------------------------
# The same inputs with an explicit separator and collapse string.
stri_join(
  c("X", "Y", "Z"),
  1:6,
  "a",
  sep = "_",
  collapse = ", "
)


## -----------------------------------------------------------------------------
# Second column of pythons, then the first, separated by ", ".
pythons[, 2] |> stri_join(pythons[, 1], sep = ", ")
# Outer product of the letters A-C and the numbers 1-5, joined with ".".
LETTERS[1:3] |> outer(1:5, stri_join, sep = ".")


## -----------------------------------------------------------------------------
letters[1:5] |> stri_dup(1:5)  # can also be written letters[1:5] %s*% 1:5


## -----------------------------------------------------------------------------
words <- list(
  c("spam", "bacon", "sausage", "spam"),
  c("eggs", "spam")
)
words |> stri_join_list(sep = ", ")  # collapse left at NULL


## -----------------------------------------------------------------------------
# As above, with the per-element results collapsed using ";\n".
words |> stri_join_list(sep = ", ", collapse = ";\n")


## -----------------------------------------------------------------------------
x <- c("spam", "buckwheat", "", NA, "bacon")
x[1:3]              # elements 1 through 3
x[c(1, length(x))]  # the first and the last element


## -----------------------------------------------------------------------------
x[-1]  # everything except the first element


## -----------------------------------------------------------------------------
# Keep only the strings that are neither empty nor missing.
x[!(stri_isempty(x) | is.na(x))]


## -----------------------------------------------------------------------------
y <- "spam, egg, spam, spam, bacon, and spam"
y |> stri_sub(18)           # code point 18 up to the end
y |> stri_sub(12, to = 15)  # code points 12 to 15, both included


## -----------------------------------------------------------------------------
y |> stri_sub(-15, length = 5)  # 5 code points starting 15th from the end


## -----------------------------------------------------------------------------
# Extract several substrings per string: each input string is lined up
# (column-wise in this layout) with its own vector of start positions
# (from) and its own vector of lengths.
(z <- stri_sub_all(
              c("spam",     "bacon", "sorghum"),
  from   = list(c(1, 3, 4), -3,      c(2, 4)),
  length = list(1,           3,      c(4, 3))))


## -----------------------------------------------------------------------------
# Join the pieces of every list element with ", ".
stri_join_list(z, sep = ", ")


## -----------------------------------------------------------------------------
# Convert the list to a matrix (by_row = TRUE, fill = "", n_min = 5).
stri_list2matrix(z, by_row = TRUE, fill = "", n_min = 5)


## -----------------------------------------------------------------------------
# A two-column matrix of from/to positions passed to stri_sub() for the
# string y defined in an earlier chunk.
(from_to <- cbind(from = c(1, 12, 18), to = c(4, 15, 21))) # +optional labels
stri_sub(y, from_to)


## -----------------------------------------------------------------------------
# The integers 1..8 arranged row-wise in two columns, used as from/to
# positions for two strings.
(from_to <- matrix(1:8, ncol = 2, byrow = TRUE))
stri_sub(c("abcdefgh", "ijklmnop"), from_to)


## -----------------------------------------------------------------------------
# The same matrix passed to stri_sub_all().
stri_sub_all(c("abcdefgh", "ijklmnop"), from_to)


## -----------------------------------------------------------------------------
# Seed set before the random shuffle below.
set.seed(123)
# Single characters of "spam" taken at positions 4, 3, 2, 3, 1 and joined.
stri_join_list(stri_sub_all("spam", c(4, 3, 2, 3, 1), length = 1))
stri_rand_shuffle("bacon")  # random order
stri_reverse("spam")        # reverse order


## -----------------------------------------------------------------------------
# Replace one substring per string: from, length and replacement are given
# as vectors alongside the two input strings.
stri_sub_replace(c("abcde", "ABCDE"),
  from = c(2, 4), length = c(1, 2), replacement = c("X", "uvw"))


## -----------------------------------------------------------------------------
# Replace several substrings per string: each input string is lined up
# (column-wise in this layout) with its own from/length/replacement entries.
stri_sub_replace_all(
                   c("abcde",  "ABCDE"),
  from        = list(c(2, 4),  c(0,    3,   6)),
  length      = list(  1,      c(0,    2,   0)),
  replacement = list(  "Z",    c("uu", "v", "wwww")))


## -----------------------------------------------------------------------------
# Replacement-function form: assigning to stri_sub(y, ...) modifies y.
y <- "spam, egg, spam, spam, bacon, and spam"
stri_sub(y, 7, length = 3) <- "spam"  # in-place replacement, egg → spam
print(y)                            # y has changed


## -----------------------------------------------------------------------------
# Replacement-function form of stri_sub_all(): three 2-character spans of y
# (starting at 1, 4 and 7) are assigned three replacement strings.
y <- "aa bb cc"
stri_sub_all(y, c(1, 4, 7), length = 2) <- c("A", "BB", "CCC")
print(y)                            # y has changed


## *****************************************************************************
## *********************************** Section 5 *******************************
## *****************************************************************************


## -----------------------------------------------------------------------------
# Compare "actg" with each element on the right using the %s===% operator.
"actg" %s===% c("ACTG", "actg", "act", "actga", NA)


## -----------------------------------------------------------------------------
# Direct call to the fixed-pattern counting function.
stri_count_fixed("abcabcdefabcabcabdc", "abc")  # search pattern is "abc"


## -----------------------------------------------------------------------------
# The same search through stri_count(), naming the pattern argument "fixed".
stri_count("abcabcdefabcabcabdc", fixed = "abc")


## -----------------------------------------------------------------------------
# The same function used with R's native pipe; the left-hand side becomes
# the first argument.
c("abcabcdefabcabcabdc", "cba", NA) |> stri_count_fixed("abc")


## -----------------------------------------------------------------------------
# Counting with case_insensitive = TRUE.
stri_count_fixed("ACTGACGacgggACg", "acg", case_insensitive = TRUE)


## -----------------------------------------------------------------------------
# The same search without and with overlap = TRUE.
stri_count_fixed("acatgacaca", "aca")  # overlap = FALSE (default)
stri_count_fixed("acatgacaca", "aca", overlap = TRUE)


## -----------------------------------------------------------------------------
# Detect the pattern "abc" in each element of x.
x <- c("abc", "abcd", "def", "xyzabc", "uabdc", "dab", NA, "abc")
stri_detect_fixed(x, "abc")


## -----------------------------------------------------------------------------
# Detection with negate = TRUE and max_count = 2.
stri_detect_fixed(x, "abc", negate = TRUE, max_count = 2)


## -----------------------------------------------------------------------------
stri_startswith_fixed(x, "abc")  # from = 1 - match at start
stri_endswith_fixed(x, "abc")    # to = -1 - match at end


## -----------------------------------------------------------------------------
# Subset x by the pattern "abc", with omit_na = TRUE.
stri_subset_fixed(x, "abc", omit_na = TRUE)


## -----------------------------------------------------------------------------
# Replacement-function form: assigns the two strings on the right to the
# subset of x selected by the pattern "abc".
stri_subset_fixed(x, "abc") <- c("*****", "***")  # modifies x in-place
print(x)  # x has changed


## -----------------------------------------------------------------------------
x <- c("aga", "actg", NA, "AGagaGAgaga")
x |> stri_locate_first_fixed("aga")                     # first occurrence
x |> stri_locate_last_fixed("aga", get_length = TRUE)   # last occurrence


## -----------------------------------------------------------------------------
# All occurrences, overlapping ones included, ignoring letter case.
x |> stri_locate_all_fixed("aga", overlap = TRUE, case_insensitive = TRUE)


## -----------------------------------------------------------------------------
# Extraction counterparts of the locate calls above.
x |> stri_extract_first_fixed("aga", case_insensitive = TRUE)
x |> stri_extract_all_fixed(
  "aga",
  overlap = TRUE, case_insensitive = TRUE, omit_no_match = TRUE
)


## -----------------------------------------------------------------------------
x <- c("aga", "actg", NA, "ggAGAGAgaGAca", "agagagaga")
# Replace every case-insensitive occurrence of "aga" with "~".
x |> stri_replace_all_fixed("aga", "~", case_insensitive = TRUE)


## -----------------------------------------------------------------------------
# One sentence, five patterns and five replacements (default vectorisation).
"The quick brown fox jumped over the lazy dog." |>
  stri_replace_all_fixed(
    c("quick", "brown", "fox", "lazy", "dog"),
    c("slow", "yellow-ish", "hen", "spamity", "llama")
  )
# The same arguments, now with vectorise_all = FALSE.
"The quick brown fox jumped over the lazy dog." |>
  stri_replace_all_fixed(
    c("quick", "brown", "fox", "lazy", "dog"),
    c("slow", "yellow-ish", "hen", "spamity", "llama"),
    vectorise_all = FALSE
  )


## -----------------------------------------------------------------------------
x <- c("a,b,c,d", "e", "", NA, "f,g,,,h,i,,j,")
# Split at commas, with omit_empty = TRUE.
x |> stri_split_fixed(",", omit_empty = TRUE)


## *****************************************************************************
## *********************************** Section 6 *******************************
## *****************************************************************************

## -----------------------------------------------------------------------------
# Number of matches of the regex "spam".
"spam, eggs, spam, bacon, sausage, and spam" |> stri_count_regex("spam")


## -----------------------------------------------------------------------------
# Case-insensitive search for "GROSS" in "groß".
"groß" |> stri_detect_regex("GROSS", case_insensitive = TRUE)


## -----------------------------------------------------------------------------
"spam..." |> stri_count_regex("\\.")  # in R source, "\\" stands for one \


## -----------------------------------------------------------------------------
x <- "Ham, spam,\njam, SPAM, eggs, and spam"
# Any two characters followed by "am", ignoring letter case.
x |> stri_extract_all_regex("..am", case_insensitive = TRUE)


## -----------------------------------------------------------------------------
# The same pattern with dot_all = TRUE added.
x |> stri_extract_all_regex("..am", dot_all = TRUE, case_insensitive = TRUE)


## -----------------------------------------------------------------------------
# "h" or "j" followed by "am".
x |> stri_extract_all_regex("[hj]am")


## -----------------------------------------------------------------------------
x <- "Nobody expects the Spanish Inquisition!"
# Three consecutive characters, none of which is a space.
x |> stri_extract_all_regex("[^ ][^ ][^ ]")


## -----------------------------------------------------------------------------
# Characters in the ranges 0-9, A-Z and a-z only.
"In 2020, Gągolewski had fun once." |> stri_extract_all_regex("[0-9A-Za-z]")


## -----------------------------------------------------------------------------
x <- "aąbßÆAĄB你123,.;'! \t-+=[]©←→”„²³¾"
p <- c("\\p{L}", "\\p{Ll}", "\\p{Lu}", "\\p{N}", "\\p{P}", "\\p{S}")
# One result per pattern, named after the pattern that produced it.
x |>
  stri_extract_all_regex(p) |>
  structure(names = p)


## -----------------------------------------------------------------------------
p <- c("\\w", "\\d", "\\s")
x |>
  stri_extract_all_regex(p) |>
  structure(names = p)


## -----------------------------------------------------------------------------
# The POSIX class [[:punct:]] as matched by base R with perl = TRUE ...
x <- ",./|\\<>?;:'\"[]{}-=_+()*&^%$€#@!`~×‒„”"
regmatches(x, gregexpr("[[:punct:]]",  x, perl = TRUE))  # base R


## -----------------------------------------------------------------------------
# ... and as matched by stringi, followed by the \p{S} class.
stri_extract_all_regex(x, "[[:punct:]]")    # equivalently: \p{P}
stri_extract_all_regex(x, "\\p{S}")         # symbols


## -----------------------------------------------------------------------------
# Alternation: match either "spam" or "ham".
x <- "spam, egg, ham, jam, algae, and an amalgam of spam, all al dente"
stri_extract_all_regex(x, "spam|ham")


## -----------------------------------------------------------------------------
# A pattern documented with inline (?# ...) comments inside the regex.
stri_extract_all_regex(x,
  "(?# match 'sp' or 'h')(sp|h)(?# and 'am')am|(?# or match 'egg')egg")


## -----------------------------------------------------------------------------
# The pattern assembled from pieces with stri_join(), each piece annotated
# with an R comment.
stri_extract_all_regex(x,
  stri_join(
      "(sp|h)",   # match either 'sp' or 'h'
      "am",       # followed by 'am'
    "|",            # ... or ...
      "egg"       # just match 'egg'
))


## -----------------------------------------------------------------------------
# The (?i) flag placed inside the pattern itself.
stri_count_regex("Spam spam SPAMITY spAm", "(?i)spam")


## -----------------------------------------------------------------------------
# Three ways of matching a parenthesised group.
x <- "sp(AM)(maps)(SP)am"
stri_extract_all_regex(x,
  c("\\(.+\\)",    # [[1]] greedy
    "\\(.+?\\)",   # [[2]] lazy
    "\\([^)]+\\)"  # [[3]] greedy (but clever)
))


## -----------------------------------------------------------------------------
# Four quantifier variants ("+", "+?", "+?m*", "+?m+") on one input.
stri_extract_first_regex("spamamamnomnomnomammmmmmmmm",
  c("sp(am|nom)+",             "sp(am|nom)+?",
    "sp(am|nom)+?m*",          "sp(am|nom)+?m+"))


## -----------------------------------------------------------------------------
# Digits with a mandatory versus an optional "." + digits part.
stri_extract_all_regex("12, 34.5, 678.901234, 37...629, ...",
  c("\\d+\\.\\d+", "\\d+(\\.\\d+)?"))


## -----------------------------------------------------------------------------
# Time a search for (a+)+b in 1000 a's followed by "c", with a time_limit
# supplied; any error raised is reported by printing "stopped.".
system.time(
  tryCatch(
    {
      stri_detect_regex("a" %s*% 1000 %s+% "c", "(a+)+b", time_limit = 1e5)
    },
    error = function(e) cat("stopped.")
  )
)


## -----------------------------------------------------------------------------
x <- "name='Sir Launcelot', quest='Seek the Grail', favcolor='blue'"
# Two capture groups: the key and the quoted value.
x |> stri_match_all_regex("(\\w+)='(.+?)'")


## -----------------------------------------------------------------------------
# Non-capturing group for the key, named group for the value.
x |> stri_match_all_regex("(?:\\w+)='(?<value>.+?)'")


## -----------------------------------------------------------------------------
# Locations of the matches, with capture_groups and get_length switched on.
x |> stri_locate_all_regex(
  "(?<key>\\w+)='(?<value>.+?)'",
  capture_groups = TRUE, get_length = TRUE
)


## -----------------------------------------------------------------------------
# Replacement text referring to the groups by number.
x |> stri_replace_all_regex("(\\w+)='(.+?)'", "$2 is a $1")


## -----------------------------------------------------------------------------
# Replacement text referring to the groups by name.
x |> stri_replace_all_regex(
  "(?<key>\\w+)='(?<value>.+?)'",
  "${value} is a ${key}"
)


## -----------------------------------------------------------------------------
# Any closing tag versus a closing tag given by the backreference \1.
"<strong><em>spam</em></strong><code>eggs</code>" |>
  stri_extract_all_regex(c("<[a-z]+>.*?</[a-z]+>", "<([a-z]+)>.*?</\\1>"))


## -----------------------------------------------------------------------------
x <- c("spam egg", "bacon spam", "spam", "egg spam bacon", "sausage")
p <- c("spam", "^spam", "spam$", "spam$|^spam", "^spam$")
# Every string tested against every pattern; rows = strings, cols = patterns.
outer(x, p, stri_detect_regex) |> structure(dimnames = list(x, p))


## -----------------------------------------------------------------------------
# Numbers delimited by word boundaries (\b) on both sides.
"12, 34.5, J23, 37.629cm" |> stri_extract_all_regex("\\b\\d+(\\.\\d+)?+\\b")


## -----------------------------------------------------------------------------
# Look-ahead (?=[,.]) and negative look-ahead (?![,.]) after a word.
"I like spam, spam, eggs, and spam." |>
  stri_extract_all_regex(c("\\w+(?=[,.])", "\\w++(?![,.])"))


## *****************************************************************************
## *********************************** Section 7 *******************************
## *****************************************************************************

## -----------------------------------------------------------------------------
stri_locale_list() |> length()  # how many locale identifiers are listed


## -----------------------------------------------------------------------------
set.seed(514678)
stri_locale_list() |> sample(5)  # five of them, drawn at random


## -----------------------------------------------------------------------------
## install.packages("stringi", configure.args = "--disable-pkg-config")


## -----------------------------------------------------------------------------
stri_locale_get()


## -----------------------------------------------------------------------------
"a\u0328" %s==% "ą"  # "a" + ogonek (U+0328) compared with "ą"


## -----------------------------------------------------------------------------
x <- c("Gągolewski", "Gagolewski", "Ga\u0328golewski")
x |> stri_unique()
x |> stri_duplicated()  # from_last left at FALSE


## -----------------------------------------------------------------------------
"chaotic" %s<% "hard"  # c < h


## -----------------------------------------------------------------------------
"chłodny" |> stri_cmp_lt("hardy", locale = "pl_PL")  # Polish: c < h


## -----------------------------------------------------------------------------
"chladný" |> stri_cmp_lt("hladný", locale = "sk_SK")  # Slovak: ch > h


## -----------------------------------------------------------------------------
# Sorting under the Polish and the Slovak locale.
c("chłodny", "hardy", "cichy", "cenny") |> stri_sort(locale = "pl_PL")
c("cudný", "chladný", "hladný", "čudný") |> stri_sort(locale = "sk_SK")


## -----------------------------------------------------------------------------
german_k_words <- c("können", "kondensieren", "kochen", "korrelieren")
# German locale, first as is and then with the phonebook collation keyword.
german_k_words |> stri_sort(locale = "de_DE")
german_k_words |> stri_sort(locale = "de_DE@collation=phonebook")


## -----------------------------------------------------------------------------
# Equivalence test of "\ufb00" and "ff" at strength = 2.
"\ufb00" |> stri_cmp_equiv("ff", strength = 2)


## -----------------------------------------------------------------------------
x <- c(
  "gro\u00df", "gross", "GROSS",
  "Gro\u00df", "Gross", "GRO\u1e9e"
)
x |> stri_unique(strength = 1)  # ß equals ss; letter case ignored
x |> stri_unique(strength = 2)  # ß differs from ss; letter case ignored


## -----------------------------------------------------------------------------
# ß equals ss; letter case taken into account.
x |> stri_unique(strength = 1, case_level = TRUE)


## -----------------------------------------------------------------------------
x <- c("code point", "code-point", "codepoint", "CODE POINT", "CodePoint")
# alternate_shifted = TRUE, without and with strength = 2.
x |> stri_unique(alternate_shifted = TRUE)
x |> stri_unique(alternate_shifted = TRUE, strength = 2)


## -----------------------------------------------------------------------------
# strength = 2 on its own.
x |> stri_unique(strength = 2)


## -----------------------------------------------------------------------------
# The same four words sorted under fr_FR and under fr_CA.
c("cote", "côte", "coté", "côté") |> stri_sort(locale = "fr_FR")
c("cote", "côte", "coté", "côté") |> stri_sort(locale = "fr_CA")  # french = TRUE


## -----------------------------------------------------------------------------
x <- c("a1", "a2", "a11", "a1", "a99", "a10", "a100", "a2", "a9", "a2")
x |> stri_sort()  # default settings


## -----------------------------------------------------------------------------
x |> stri_sort(numeric = TRUE)  # with numeric = TRUE


## -----------------------------------------------------------------------------
set.seed(123)
X <- data.frame(
  a = x,
  b = runif(length(x))
)
# Order rows by decreasing numeric-aware rank of a, then by increasing b.
X[order(-stri_rank(X$a, numeric = TRUE), X$b), ]


## -----------------------------------------------------------------------------
# Collator-based detection at strength = 1 under two locales.
"Er ist so groß." |>
  stri_detect_coll("GROSS", strength = 1, locale = "de_AT")
"On je chladný" |>
  stri_detect_coll("chladny", strength = 1, locale = "sk_SK")


## *****************************************************************************
## *********************************** Section 8 *******************************
## *****************************************************************************

## -----------------------------------------------------------------------------
# Two sentences concatenated with %s+%.
x <- "The\u00a0above-mentioned    features are useful. " %s+%
  "My hovercraft is full of eels, eggs, and spam."


## -----------------------------------------------------------------------------
x |> stri_count_boundaries(type = "sentence")  # sentence-type boundaries


## -----------------------------------------------------------------------------
x |> stri_extract_all_words()


## -----------------------------------------------------------------------------
# Pad with "." on both sides up to width 77 and print the result.
"SPAMITY SPAM" |>
  stri_pad(width = 77, side = "both", pad = ".") |>
  cat()


## -----------------------------------------------------------------------------
x <- "      spam, eggs, and lovely spam.\n"
x |> stri_trim()  # side defaults to "both"


## -----------------------------------------------------------------------------
set.seed(1233)
x <- stri_rand_lipsum(1)  # one paragraph of random text
# Wrap to width 74 with the given indent, exdent and "> " prefix; one
# output line per wrapped line.
x |>
  stri_wrap(width = 74, indent = 8, exdent = 4, prefix = "> ") |>
  cat(sep = "\n")


## -----------------------------------------------------------------------------
# Format both strings with "[%6s]" and print one per line.
"[%6s]" |>
  stri_sprintf(c("abcd", "\u200b\u200b\u200bąß²€")) |>
  cat(sep = "\n")


## -----------------------------------------------------------------------------
"value='%.4f'" %s$% pi  # same as: "value='%.4f'" %s$% list(pi)
"%s='%.*3$f'" %s$% list("pi", pi, 1:4)


## -----------------------------------------------------------------------------
set.seed(123)
# Five random strings of lengths 2 to 6 drawn from the set [ACTG].
stri_rand_strings(n = 5, length = 2:6, pattern = "[ACTG]")


## -----------------------------------------------------------------------------
# One random string of length 8 drawn from the given character set.
stri_rand_strings(n = 1, length = 8, pattern = "[\\p{script=Katakana}&\\p{L}]")


## -----------------------------------------------------------------------------
"groß" |> stri_trans_toupper()
"Iİ" |> stri_trans_tolower(locale = "tr_TR")               # Turkish locale
"ijsvrij yoghurt" |> stri_trans_totitle(locale = "nl_NL")  # Dutch locale


## -----------------------------------------------------------------------------
# Character-by-character translation of "ACGT" into "tgca".
"GATAAATCTGGTCTTATTTCC" |> stri_trans_char("ACGT", "tgca")


## -----------------------------------------------------------------------------
set.seed(12345)
stri_trans_list() |> sample(9)  # nine entries picked at random


## -----------------------------------------------------------------------------
# General transform with the identifier "upper; latin-ascii".
"groß© żółć La Niña köszönöm" |> stri_trans_general("upper; latin-ascii")


## -----------------------------------------------------------------------------
x <- "1 maja 2021 r., godz. 17:17:32"


## -----------------------------------------------------------------------------
# Parse x with an explicit date-time pattern, Polish locale, Warsaw zone.
x |> stri_datetime_parse(
  "dd MMMM yyyy 'r., godz.' HH:mm:ss",
  locale = "pl_PL", tz = "Europe/Warsaw"
)


## -----------------------------------------------------------------------------
# The same parse, with the pattern produced by stri_datetime_fstr() from a
# %-style format string.
x |> stri_datetime_parse(
  stri_datetime_fstr("%d %B %Y r., godz. %H:%M:%S"),
  locale = "pl_PL", tz = "Europe/Warsaw"
)


## -----------------------------------------------------------------------------
stri_datetime_now() |>
  stri_datetime_add(1, "day") |>  # 'now' plus one day
  stri_datetime_format(
    "datetime_relative_long",  # full format, expressed relative to 'now'
    locale = "en_NZ", tz = "NZ"
  )


## -----------------------------------------------------------------------------
# The first day of each month of 2020, with the Hebrew calendar keyword.
stri_datetime_create(2020, 1:12, 1) |>
  stri_datetime_format("date_long", locale = "@calendar=hebrew")
# Two dates of 2020, with the Japanese locale and calendar keyword.
stri_datetime_create(2020, c(2, 8), c(4, 7)) |>
  stri_datetime_format("date_full", locale = "ja_JP@calendar=japanese")


## *****************************************************************************
## *********************************** Section 9 *******************************
## *****************************************************************************

## -----------------------------------------------------------------------------
z <- c(
  "\\p{L}", "\\p{Ll}", "\\p{Lu}", "\\p{N}", "\\p{P}", "\\p{S}",
  "\\w", "\\d", "\\s"
)
# Build one string from the code points 1..0x10ffff minus 0xd800..0xf8ff,
# count the matches of every pattern in it, and label counts by pattern.
setdiff(1:0x10ffff, c(0xd800:0xf8ff)) |>
  stri_enc_fromutf32() |>
  stri_count_regex(z) |>
  structure(names = z)


## -----------------------------------------------------------------------------
"\u007A"  # alternatively "\U0000007A"


## -----------------------------------------------------------------------------
x <- "zß你好"
x |> stri_escape_unicode()


## -----------------------------------------------------------------------------
"\U001F600"  # the grinning face emoji (font unavailable)


## -----------------------------------------------------------------------------
"\U001F600" |> stri_trans_general("any-name")  # look up the character name


## -----------------------------------------------------------------------------
x <- "abz0ąß你好!"
# Raw bytes of x in two target encodings.
stri_encode(x, to = "UTF-8", to_raw = TRUE)[[1]]
stri_encode(x, to = "UTF-16LE", to_raw = TRUE)[[1]]  # little-endian variant


## -----------------------------------------------------------------------------
# Read the file as text lines, declaring its encoding.
x <- "ES_latin1.txt" |> stri_read_lines(encoding = "ISO-8859-1")
x |> head(4)  # by now x holds UTF-8 text


## -----------------------------------------------------------------------------
# Read the same file again, this time as raw bytes.
x <- "ES_latin1.txt" |> stri_read_raw()
x |> head(24)  # x is a raw vector


## -----------------------------------------------------------------------------
x |> stri_enc_isascii()
x |> stri_enc_isutf8()  # may yield false positives


## -----------------------------------------------------------------------------
x |> stri_enc_detect()  # heuristic guess


## -----------------------------------------------------------------------------
# Convert the raw bytes from ISO-8859-1 to UTF-8.
y <- x |> stri_encode(from = "ISO-8859-1", to = "UTF-8")


## -----------------------------------------------------------------------------
y |>
  stri_split_lines1() |>
  tail(4)  # spoiler alert!


## -----------------------------------------------------------------------------
x <- "a\u0328 ą"  # "a", combining ogonek, space, "a with ogonek"
# Code points (as decimal numbers) of x, of its NFC and of its NFD form.
c(x, stri_trans_nfc(x), stri_trans_nfd(x)) |> stri_enc_toutf32()


## -----------------------------------------------------------------------------
"r²︷" |> stri_trans_nfkd()