您的当前位置：首页正文

stringr包介绍

2024-11-10 来源：个人技术集锦

1. case用法：

str_to_upper(string, locale = "")
str_to_lower(string, locale = "")
str_to_title(string, locale = "")

string为要处理的字符串；locale为大小写转换所依据的语言环境（如 "en" 表示英语、"tr" 表示土耳其语），不同语言的转换规则可能不同。

> dog <- "The quick brown dog"
> str_to_upper(dog)
[1] "THE QUICK BROWN DOG"
> str_to_lower(dog)
[1] "the quick brown dog"
> str_to_title(dog)
[1] "The Quick Brown Dog"
> str_to_upper("i", "en") # english
[1] "I"
> str_to_upper("i", "tr") # Turkish
[1] "İ"

2. str_c的用法：
str_c(..., sep = "", collapse = NULL)

... 为一个或多个字符串向量；sep为插入各输入向量对应元素之间的分隔符；collapse为将结果合并为单个字符串时使用的连接符（默认为NULL，即不合并）。
> str_c("Letter", letters, sep = ": ")
 [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e"
 [6] "Letter: f" "Letter: g" "Letter: h" "Letter: i" "Letter: j"
[11] "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o"
[16] "Letter: p" "Letter: q" "Letter: r" "Letter: s" "Letter: t"
[21] "Letter: u" "Letter: v" "Letter: w" "Letter: x" "Letter: y"
[26] "Letter: z"

> str_c(letters, collapse = "")
[1] "abcdefghijklmnopqrstuvwxyz"
> str_c(letters, collapse = ",")
[1] "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z"
> str_c(c("a", NA, "B"), "-d")
[1] "a-d" NA    "B-d"
> str_c(str_replace_na(c("a", NA, "b")), "-d")

[1] "a-d"  "NA-d" "b-d"

3. str_count的用法：

str_count(string, pattern = "")

string为字符串；pattern为寻找模式。

> fruit <- c("apple", "banana", "pear", "pineapple")
> str_count(fruit, "a")
[1] 1 3 1 1
> str_count(fruit, "p")
[1] 2 0 1 3
> str_count(fruit, c("a","b","p","p")) # 对应每一个查找
[1] 1 1 1 3
> str_count(c("a.", "...", ".a.a"), ".") # 此处. 为正则表达式
[1] 2 3 4
> str_count(c("a.", "...", ".a.a"), fixed(".")) #fixed(".")为只查找.号，也可用"\\."
[1] 1 3 2

4. str_detect的用法：

str_detect(string, pattern)

string与pattern如3.

> str_detect(fruit, "a") # 检测是否有a
[1] TRUE TRUE TRUE TRUE
> str_detect(fruit, "^a") # 检测字符串是否以a开头
[1]  TRUE FALSE FALSE FALSE
> str_detect(fruit, "a$")  # 检测字符串是否以a结尾
[1] FALSE  TRUE FALSE FALSE

5. str_extract/str_extract_all的用法：

str_extract(string, pattern)

str_extract_all(string, pattern, simplify = FALSE)

string、pattern如上；simplify为FALSE（默认）时返回由字符向量组成的列表，为TRUE时返回字符矩阵。

> shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
> str_extract(shopping_list, "\\d") # \\d+ 更好一些 
[1] "4" NA  NA  "2"

> str_extract(shopping_list, "[a-z]+")
[1] "apples" "bag"    "bag"    "milk"

> str_extract(shopping_list, "[a-z]{1,4}")
[1] "appl" "bag"  "bag"  "milk"
> str_extract(shopping_list, "\\b[a-z]{1,4}\\b") # \\b 为边界 
[1] NA     "bag"  "bag"  "milk"
> str_extract_all(shopping_list, "[a-z]+") #由此看出str_extract与str_extract_all的不同
[[1]]
[1] "apples" "x"     

[[2]]
[1] "bag"   "of"    "flour"

[[3]]
[1] "bag"   "of"    "sugar"

[[4]]
[1] "milk" "x"   

> str_extract_all(shopping_list, "\\b[a-z]+\\b")
[[1]]
[1] "apples"

[[2]]
[1] "bag"   "of"    "flour"

[[3]]
[1] "bag"   "of"    "sugar"

[[4]]
[1] "milk"

> str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE) # 生成字符串矩阵
     [,1]     [,2] [,3]   
[1,] "apples" ""   ""     
[2,] "bag"    "of" "flour"
[3,] "bag"    "of" "sugar"
[4,] "milk"   ""   ""     
> str_extract_all("This is, suprisingly, a sentence.", boundary("word"))# 以单词为边界
[[1]]
[1] "This"        "is"          "suprisingly" "a"          
[5] "sentence"

6. str_match的用法：

str_match(string, pattern)

string与pattern用法如上。

> strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
+              "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
+              "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
+              "Home: 543.355.3679")
> strings
 [1] " 219 733 8965"                 "329-293-8753 "                
 [3] "banana"                        "595 794 7569"                 
 [5] "387 287 6718"                  "apple"                        
 [7] "233.398.9187 "                 "482 952 3315"                 
 [9] "239 923 8115 and 842 566 4692" "Work: 579-499-7527"           
[11] "$1000"                         "Home: 543.355.3679"           
> phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})" # 正则表达式的用法详见《正则表达式必知必会》
> str_extract(strings, phone)
 [1] "219 733 8965" "329-293-8753" NA             "595 794 7569" "387 287 6718"
 [6] NA             "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527"
[11] NA             "543.355.3679"
> str_match(strings, phone)
      [,1]           [,2]  [,3]  [,4]  
 [1,] "219 733 8965" "219" "733" "8965"
 [2,] "329-293-8753" "329" "293" "8753"
 [3,] NA             NA    NA    NA    
 [4,] "595 794 7569" "595" "794" "7569"
 [5,] "387 287 6718" "387" "287" "6718"
 [6,] NA             NA    NA    NA    
 [7,] "233.398.9187" "233" "398" "9187"
 [8,] "482 952 3315" "482" "952" "3315"
 [9,] "239 923 8115" "239" "923" "8115"
[10,] "579-499-7527" "579" "499" "7527"
[11,] NA             NA    NA    NA    
[12,] "543.355.3679" "543" "355" "3679"

7. str_pad的用法：

str_pad(string, width, side = c("left", "right", "both"), pad = " ")

string为字符串；width为填充后字符串的最小宽度；side为填充字符所加的一侧（左侧、右侧或两侧）；pad为用于填充的单个字符，默认为空格。

> rbind(
+     str_pad("hadley", 30, "left"),
+     str_pad("hadley", 30, "right"),
+     str_pad("hadley", 30, "both")
+ )
     [,1]                            
[1,] "                        hadley"
[2,] "hadley                        "
[3,] "            hadley            "

> rbind(
+     str_pad("hadley", 30, "left", pad = "."),
+     str_pad("hadley", 30, "right", pad = "."),
+     str_pad("hadley", 30, "both", pad = ".")
+ )
     [,1]                            
[1,] "........................hadley"
[2,] "hadley........................"
[3,] "............hadley............"

8. str_replace的用法：

str_replace(string, pattern, replacement)

string为字符串；pattern为要被替换的内容，通常为正则表达式；replacement为用于替换的字符串。

> fruits <- c("one apple", "two pears", "three bananas")
> str_replace(fruits, "[aeiou]", "-")
[1] "-ne apple"     "tw- pears"     "thr-e bananas"
> str_replace_all(fruits, "[aeiou]", "-")
[1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

> str_replace(fruits, "([aeiou])", "")
[1] "ne apple"     "tw pears"     "thre bananas"

> str_replace_all(str_c(fruits, collapse = "---"), c("one" = 1, "two" = 2, "three" = 3)) # 如果要对同一个字符串应用多组模式与替换，可以把命名向量传给pattern（名称为模式，取值为替换内容）
[1] "1 apple---2 pears---3 bananas"

9. str_split的用法：

str_split(string, pattern, n = Inf, simplify = FALSE)

str_split_fixed(string, pattern, n)

string为字符串；pattern为分离模式；n为分割为多少块；simplify：FALSE的时候返回字符串向量列表，为TRUE的时候返回字符串矩阵。

10. str_sub的用法：

str_sub(string, start = 1L, end = -1L)

string为字符串；start和end分别为截取的起始位置和结束位置（两端均包含在内），负数表示从字符串末尾倒数。

11. str_subset的用法：

str_subset(string, pattern)

string与pattern用法如上。

> fruit <- c("apple", "banana", "pear", "pinapple")
> str_subset(fruit, "a")
[1] "apple"    "banana"   "pear"     "pinapple"
> str_subset(fruit, "^a")
[1] "apple"
> str_detect(fruit, "^a") # 对比str_detect与str_subset的用法
[1]  TRUE FALSE FALSE FALSE
> str_subset(fruit, "a$")
[1] "banana"
> str_subset(fruit, "b")
[1] "banana"
> str_subset(fruit, "[aeiou]")
[1] "apple"    "banana"   "pear"     "pinapple"

> hw <- "Hadley Wickham"
> str_sub(hw, 1, 6)
[1] "Hadley"
> str_sub(hw, end = 6)
[1] "Hadley"
> str_sub(hw, 8, 14)
[1] "Wickham"
> str_sub(hw, c(1,8), c(6,14))
[1] "Hadley"  "Wickham"
> str_sub(hw, -1)
[1] "m"
> str_sub(hw, -7)
[1] "Wickham"

> x <- "BBCDEF"
> str_sub(x, 1, 1)
[1] "B"
> str_sub(x, 1, 1) <- "A"
> x
[1] "ABCDEF"
> str_sub(x, -1, -1) <- "K"
> x
[1] "ABCDEK"
> str_sub(x, -2, -2) <- "GHIJ"; x
[1] "ABCDGHIJK"
> str_sub(x, 2, -2) <- ""; x
[1] "AK"

> fruits <- c(
+     "apples and oranges and pears and bananas",
+     "pineapples and mangos and guavas"
+ )
> fruits
[1] "apples and oranges and pears and bananas" "pineapples and mangos and guavas"        
> str_split(fruits, "and")
[[1]]
[1] "apples "   " oranges " " pears "   " bananas" 

[[2]]
[1] "pineapples " " mangos "    " guavas"    

> str_split(fruits, "and", simplify = TRUE)
     [,1]          [,2]        [,3]      [,4]      
[1,] "apples "     " oranges " " pears " " bananas"
[2,] "pineapples " " mangos "  " guavas" ""

> str_split(fruits, "and", n=3)
[[1]]
[1] "apples "            " oranges "          " pears and bananas"

[[2]]
[1] "pineapples " " mangos "    " guavas"

> str_split(fruits, "and", n=5)
[[1]]
[1] "apples "   " oranges " " pears "   " bananas" 

[[2]]
[1] "pineapples " " mangos "    " guavas"    

> str_split_fixed(fruits, "and", 3)
     [,1]          [,2]        [,3]                
[1,] "apples "     " oranges " " pears and bananas"
[2,] "pineapples " " mangos "  " guavas"           
> str_split_fixed(fruits, "and", 4) # n大于实际分割出的片段数时，不足的部分用空字符串填充
     [,1]          [,2]        [,3]      [,4]      
[1,] "apples "     " oranges " " pears " " bananas"
[2,] "pineapples " " mangos "  " guavas" ""

> str_subset(c("a", "b", NA), ".") # 自动去掉缺失值
[1] "a" "b"

12. word的用法：

word(string, start = 1L, end = start, sep = fixed(" "))
string为字符串；start和end分别为要提取的起始单词和结束单词的位置（负数表示从末尾倒数）；sep为单词之间的分隔符。

> sentences <- c("Jane saw a cat", "Jane sat down")
> sentences
[1] "Jane saw a cat" "Jane sat down" 
> word(sentences, 1)
[1] "Jane" "Jane"
> word(sentences, 2)
[1] "saw" "sat"
> word(sentences, -1)
[1] "cat"  "down"
> word(sentences, 2, -1)
[1] "saw a cat" "sat down" 
> word(sentences[1], 1:3, -1)
[1] "Jane saw a cat" "saw a cat"      "a cat"         
> word(sentences[1], 1, 1:4)
[1] "Jane"           "Jane saw"       "Jane saw a"     "Jane saw a cat"
> str <- 'abc.def..123.4568.999'
> word(str, 1, sep = fixed('..'))#提取分隔后的第一个
[1] "abc.def"
> word(str, 2, sep = fixed('..')) #提取分隔后的第二个
[1] "123.4568.999"
> word(str, 1, -1, sep = fixed('..'))
[1] "abc.def..123.4568.999"