1. case用法:
str_to_upper(string, locale = "")
str_to_lower(string, locale = "")
str_to_title(string, locale = "")
string为要处理的字符串;locale为大小写转换规则所依据的语言区域代码(如 "en" 表示英语、"tr" 表示土耳其语),不同语言的转换结果可能不同。
> dog <- "The quick brown dog" > str_to_upper(dog) [1] "THE QUICK BROWN DOG" > str_to_lower(dog) [1] "the quick brown dog" > str_to_title(dog) [1] "The Quick Brown Dog" > str_to_upper("i", "en") # english [1] "I" > str_to_upper("i", "tr") # Turkish [1] "İ"
2. str_c的用法: str_c(..., sep = "", collapse = NULL)
... 为一组字符串向量;sep为插入到各输入向量之间的分隔字符串;collapse为把结果向量合并为单个字符串时使用的分隔符(默认为NULL,即不合并) > str_c("Letter", letters, sep = ": ") [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e" [6] "Letter: f" "Letter: g" "Letter: h" "Letter: i" "Letter: j" [11] "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o" [16] "Letter: p" "Letter: q" "Letter: r" "Letter: s" "Letter: t" [21] "Letter: u" "Letter: v" "Letter: w" "Letter: x" "Letter: y" [26] "Letter: z"
> str_c(letters, collapse = "") [1] "abcdefghijklmnopqrstuvwxyz" > str_c(letters, collapse = ",") [1] "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z" > str_c(c("a", NA, "B"), "-d") [1] "a-d" NA "B-d" > str_c(str_replace_na(c("a", NA, "b")), "-d")
[1] "a-d" "NA-d" "b-d"
3. str_count的用法:
str_count(string, pattern = "")
string为字符串;pattern为寻找模式。
> fruit <- c("apple", "banana", "pear", "pineapple") > str_count(fruit, "a") [1] 1 3 1 1 > str_count(fruit, "p") [1] 2 0 1 3 > str_count(fruit, c("a","b","p","p")) # 对应每一个查找 [1] 1 1 1 3 > str_count(c("a.", "...", ".a.a"), ".") # 此处. 为正则表达式 [1] 2 3 4 > str_count(c("a.", "...", ".a.a"), fixed(".")) #fixed(".")为只查找.号,也可用"\\." [1] 1 3 2
4. str_detect的用法:
str_detect(string, pattern)
string与pattern如3.
> str_detect(fruit, "a") # 检测是否有a [1] TRUE TRUE TRUE TRUE > str_detect(fruit, "^a") # 检测字符串是否以a开头 [1] TRUE FALSE FALSE FALSE > str_detect(fruit, "a$") # 检测字符串是否以a结尾 [1] FALSE TRUE FALSE FALSE
5. str_extract/str_extract_all的用法:
str_extract(string, pattern)
str_extract_all(string, pattern, simplify = FALSE)
string,pattern如上;simplify:FALSE为返回字符串向量列表(每个输入字符串对应一个向量),TRUE为返回字符串矩阵。
> shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2") > str_extract(shopping_list, "\\d") # \\d+ 更好一些 [1] "4" NA NA "2"
> str_extract(shopping_list, "[a-z]+") [1] "apples" "bag" "bag" "milk"
> str_extract(shopping_list, "[a-z]{1,4}") [1] "appl" "bag" "bag" "milk" > str_extract(shopping_list, "\\b[a-z]{1,4}\\b") # \\b 为边界 [1] NA "bag" "bag" "milk" > str_extract_all(shopping_list, "[a-z]+") #由此看出str_extract与str_extract_all的不同 [[1]] [1] "apples" "x" [[2]] [1] "bag" "of" "flour" [[3]] [1] "bag" "of" "sugar" [[4]] [1] "milk" "x" > str_extract_all(shopping_list, "\\b[a-z]+\\b") [[1]] [1] "apples" [[2]] [1] "bag" "of" "flour" [[3]] [1] "bag" "of" "sugar" [[4]] [1] "milk"
> str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE) # 生成字符串矩阵 [,1] [,2] [,3] [1,] "apples" "" "" [2,] "bag" "of" "flour" [3,] "bag" "of" "sugar" [4,] "milk" "" "" > str_extract_all("This is, suprisingly, a sentence.", boundary("word"))# 以单词为边界 [[1]] [1] "This" "is" "suprisingly" "a" [5] "sentence"
6. str_match的用法:
str_match(string, pattern)
string与pattern用法如上。
> strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569", + "387 287 6718", "apple", "233.398.9187 ", "482 952 3315", + "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000", + "Home: 543.355.3679") > strings [1] " 219 733 8965" "329-293-8753 " [3] "banana" "595 794 7569" [5] "387 287 6718" "apple" [7] "233.398.9187 " "482 952 3315" [9] "239 923 8115 and 842 566 4692" "Work: 579-499-7527" [11] "$1000" "Home: 543.355.3679" > phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})" # 正则表达式的用法详见《正则表达式必知必会》 > str_extract(strings, phone) [1] "219 733 8965" "329-293-8753" NA "595 794 7569" "387 287 6718" [6] NA "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527" [11] NA "543.355.3679" > str_match(strings, phone) [,1] [,2] [,3] [,4] [1,] "219 733 8965" "219" "733" "8965" [2,] "329-293-8753" "329" "293" "8753" [3,] NA NA NA NA [4,] "595 794 7569" "595" "794" "7569" [5,] "387 287 6718" "387" "287" "6718" [6,] NA NA NA NA [7,] "233.398.9187" "233" "398" "9187" [8,] "482 952 3315" "482" "952" "3315" [9,] "239 923 8115" "239" "923" "8115" [10,] "579-499-7527" "579" "499" "7527" [11,] NA NA NA NA [12,] "543.355.3679" "543" "355" "3679"
7. str_pad的用法:
str_pad(string, width, side = c("left", "right", "both"), pad = " ")
string为字符串;width为填充后字符串的最小宽度;side为填充字符添加在哪一侧(左、右或两侧);pad为填充的单个字符,默认为空格。
> rbind( + str_pad("hadley", 30, "left"), + str_pad("hadley", 30, "right"), + str_pad("hadley", 30, "both") + ) [,1] [1,] "                        hadley" [2,] "hadley                        " [3,] "            hadley            "
> rbind( + str_pad("hadley", 30, "left", pad = "."), + str_pad("hadley", 30, "right", pad = "."), + str_pad("hadley", 30, "both", pad = ".") + ) [,1] [1,] "........................hadley" [2,] "hadley........................" [3,] "............hadley............"
8. str_replace的用法:
str_replace(string, pattern, replacement)
string为字符串;pattern为要被替换的内容,常为正则表达式;replacement为用于替换的字符串。
> fruits <- c("one apple", "two pears", "three bananas") > str_replace(fruits, "[aeiou]", "-") [1] "-ne apple" "tw- pears" "thr-e bananas" > str_replace_all(fruits, "[aeiou]", "-") [1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
> str_replace(fruits, "([aeiou])", "") [1] "ne apple" "tw pears" "thre bananas"
> str_replace_all(str_c(fruits, collapse = "---"), c("one" = 1, "two" = 2, "three" = 3))##如果要对同一个字符串应用多组模式与替换,可以给pattern传入一个命名向量(名称为模式,值为替换内容) [1] "1 apple---2 pears---3 bananas"
9. str_split的用法:
str_split(string, pattern, n = Inf, simplify = FALSE)
str_split_fixed(string, pattern, n)
string为字符串;pattern为分隔模式;n为最多分割成的块数;simplify:为FALSE的时候返回字符串向量列表,为TRUE的时候返回字符串矩阵。
10. str_sub的用法:
str_sub(string, start = 1L, end = -1L)
string为字符串;start和end分别为开始和结束字符的位置(负数表示从字符串末尾倒数)。
11. str_subset的用法:
str_subset(string, pattern)
string与pattern用法如上。
> fruit <- c("apple", "banana", "pear", "pinapple") > str_subset(fruit, "a") [1] "apple" "banana" "pear" "pinapple" > str_subset(fruit, "^a") [1] "apple" > str_detect(fruit, "^a") # 对比str_detect与str_subset的用法 [1] TRUE FALSE FALSE FALSE > str_subset(fruit, "a$") [1] "banana" > str_subset(fruit, "b") [1] "banana" > str_subset(fruit, "[aeiou]") [1] "apple" "banana" "pear" "pinapple"
> hw <- "Hadley Wickham" > str_sub(hw, 1, 6) [1] "Hadley" > str_sub(hw, end = 6) [1] "Hadley" > str_sub(hw, 8, 14) [1] "Wickham" > str_sub(hw, c(1,8), c(6,14)) [1] "Hadley" "Wickham" > str_sub(hw, -1) [1] "m" > str_sub(hw, -7) [1] "Wickham"
> x <- "BBCDEF" > str_sub(x, 1, 1) [1] "B" > str_sub(x, 1, 1) <- "A" > x [1] "ABCDEF" > str_sub(x, -1, -1) <- "K" > x [1] "ABCDEK" > str_sub(x, -2, -2) <- "GHIJ"; x [1] "ABCDGHIJK" > str_sub(x, 2, -2) <- ""; x [1] "AK"
> fruits <- c( + "apples and oranges and pears and bananas", + "pineapples and mangos and guavas" + ) > fruits [1] "apples and oranges and pears and bananas" "pineapples and mangos and guavas" > str_split(fruits, "and") [[1]] [1] "apples " " oranges " " pears " " bananas" [[2]] [1] "pineapples " " mangos " " guavas" > str_split(fruits, "and", simplify = TRUE) [,1] [,2] [,3] [,4] [1,] "apples " " oranges " " pears " " bananas" [2,] "pineapples " " mangos " " guavas" ""
> str_split(fruits, "and", n=3) [[1]] [1] "apples " " oranges " " pears and bananas" [[2]] [1] "pineapples " " mangos " " guavas"
> str_split(fruits, "and", n=5) [[1]] [1] "apples " " oranges " " pears " " bananas" [[2]] [1] "pineapples " " mangos " " guavas" > str_split_fixed(fruits, "and", 3) [,1] [,2] [,3] [1,] "apples " " oranges " " pears and bananas" [2,] "pineapples " " mangos " " guavas" > str_split_fixed(fruits, "and", 4) # n大于分割的字符串时,多余的用空字符串表示 [,1] [,2] [,3] [,4] [1,] "apples " " oranges " " pears " " bananas" [2,] "pineapples " " mangos " " guavas" ""
> str_subset(c("a", "b", NA), ".") # 自动去掉缺失值 [1] "a" "b"
12. word的用法:
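word(string, start = 1L, end = start, sep = fixed(" "))
string为字符串;start和end分别为要提取的起始和结束单词的位置(负数表示从末尾倒数);sep为单词之间的分隔符。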
> sentences <- c("Jane saw a cat", "Jane sat down") > sentences [1] "Jane saw a cat" "Jane sat down" > word(sentences, 1) [1] "Jane" "Jane" > word(sentences, 2) [1] "saw" "sat" > word(sentences, -1) [1] "cat" "down" > word(sentences, 2, -1) [1] "saw a cat" "sat down" > word(sentences[1], 1:3, -1) [1] "Jane saw a cat" "saw a cat" "a cat" > word(sentences[1], 1, 1:4) [1] "Jane" "Jane saw" "Jane saw a" "Jane saw a cat" > str <- 'abc.def..123.4568.999' > word(str, 1, sep = fixed('..'))#提取分隔后的第一个 [1] "abc.def" > word(str, 2, sep = fixed('..')) #提取分隔后的第二个 [1] "123.4568.999" > word(str, 1, -1, sep = fixed('..')) [1] "abc.def..123.4568.999"