Exercises with regular expressions
# Toy data set used by all exercises and examples below: six samples, each
# with a patient ID, a Sentrix position, a treatment label and a numeric value.
# NOTE(review): "tumor_02" occurs twice in Patient.ID and "A016A01" is one
# character longer than the other positions -- confirm both are intended.
dataset <- data.frame(
  Patient.ID = c(
    "normal_01", "normal_02", "normal_03",
    "tumor_01", "tumor_02", "tumor_02"
  ),
  Sentrix.position = c(
    "A01B01", "A01B02", "A016A01",
    "B02A02", "C01D02", "C02C01"
  ),
  Treatment = c(
    "Treated", "Treated", "Not treated",
    "Treated", "Treated", "Not treated"
  ),
  value = c(3.25, 3.67, 4.26, 6.24, 5.78, 7.32),
  row.names = paste0("Sample", 1:6)
)
- Create a column with sample type (tumor or normal)
- Tabulate treatment versus sample type
- Add an "_" to the sample names, e.g. Sample_3
- Summarize all values that come from normal samples
- change all “A”s in the Sentrix.position column to “E”s.
- change all “E”s back to “A”s if they appear second. Make the solution as general as possible.
# Examples:
# Lines starting with "##" show the console output of the preceding call.

# grep() returns the indices of the elements that contain the pattern.
grep("normal", dataset$Patient.ID)
## [1] 1 2 3
# A partial word is enough: the pattern only has to occur somewhere in the ID.
grep("norm", dataset$Patient.ID)
## [1] 1 2 3
# No ID contains "nom", so the result is an empty integer vector.
grep("nom", dataset$Patient.ID)
## integer(0)
# grepl() answers the same question with one TRUE/FALSE per element.
grepl("normal", dataset$Patient.ID)
## [1] TRUE TRUE TRUE FALSE FALSE FALSE
# [[:alpha:]] matches a single letter; every ID contains at least one.
grepl("[[:alpha:]]", dataset$Patient.ID)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
# {5} asks for five letters in a row: both "normal" and "tumor" qualify.
grepl("[[:alpha:]]{5}", dataset$Patient.ID)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
# {6} asks for six letters in a row: only "normal" is long enough.
grepl("[[:alpha:]]{6}", dataset$Patient.ID)
## [1] TRUE TRUE TRUE FALSE FALSE FALSE
# A letter, then "_", then a digit ([[:digit:]]): present in every ID.
grepl("[[:alpha:]]_[[:digit:]]", dataset$Patient.ID)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
# Six letters, "_", two digits: matches only the "normal_NN" IDs.
grepl("[[:alpha:]]{6}_[[:digit:]]{2}", dataset$Patient.ID)
## [1] TRUE TRUE TRUE FALSE FALSE FALSE
# regexec() returns a list with one element per input string: the start
# position of the match, with its length in the "match.length" attribute.
# The match is the letter before "_", the "_" itself and the digit after it
# (3 characters); it starts at character 6 in "normal_NN" and at character 5
# in "tumor_NN".
regexec("[[:alpha:]]_[[:digit:]]", dataset$Patient.ID)
## [[1]]
## [1] 6
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] 6
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[3]]
## [1] 6
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[4]]
## [1] 5
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[5]]
## [1] 5
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[6]]
## [1] 5
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
# gregexpr() reports the start position and length of every match in each
# string (a list with one element per input). Each ID contains exactly one
# match of this pattern, so every list element holds a single position:
# 6 for the "normal_NN" IDs and 5 for the "tumor_NN" IDs.
gregexpr("[[:alpha:]]_[[:digit:]]", dataset$Patient.ID)
## [[1]]
## [1] 6
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] 6
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[3]]
## [1] 6
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[4]]
## [1] 5
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[5]]
## [1] 5
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
##
## [[6]]
## [1] 5
## attr(,"match.length")
## [1] 3
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
# gsub() replaces every match of the pattern; here each "_" becomes ".".
gsub("_", ".", dataset$Patient.ID)
## [1] "normal.01" "normal.02" "normal.03" "tumor.01" "tumor.02" "tumor.02"
# Pitfall: in a regular expression an unescaped "." matches any character,
# so every character of every ID is replaced.
gsub(".", "_", dataset$Patient.ID)
## [1] "_________" "_________" "_________" "________" "________" "________"
# Escaping the dot ("\\.") matches a literal "."; the IDs contain none, so
# they come back unchanged.
gsub("\\.", "_", dataset$Patient.ID)
## [1] "normal_01" "normal_02" "normal_03" "tumor_01" "tumor_02" "tumor_02"
# fixed = TRUE treats the pattern as a literal string instead of a regex.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
gsub(".", "_", dataset$Patient.ID, fixed = TRUE)
## [1] "normal_01" "normal_02" "normal_03" "tumor_01" "tumor_02" "tumor_02"
# Parentheses create capture groups that the replacement can refer to:
# "\\2" is the second group (the two digits after the "_") ...
gsub("([[:alpha:]]{5,6})_([[:digit:]]{2})", "\\2", dataset$Patient.ID)
## [1] "01" "02" "03" "01" "02" "02"
# ... and "\\1" is the first group (the five or six letters before the "_").
gsub("([[:alpha:]]{5,6})_([[:digit:]]{2})", "\\1", dataset$Patient.ID)
## [1] "normal" "normal" "normal" "tumor" "tumor" "tumor"
# The explicit range [A-Za-z] gives the same result as [[:alpha:]] on these IDs.
gsub("([A-Za-z]{5,6})_([[:digit:]]{2})", "\\1", dataset$Patient.ID)
## [1] "normal" "normal" "normal" "tumor" "tumor" "tumor"
# Task "Create a column with sample type (tumor or normal)": keep only the
# letters in front of the underscore of each Patient.ID.
# The pattern is anchored (^ ... $) and uses open-ended quantifiers (+), so a
# prefix of any length followed by "_" and digits is handled; an ID that does
# not have this shape is returned unchanged by sub().
dataset$Sample_type <- sub("^([[:alpha:]]+)_[[:digit:]]+$", "\\1", dataset$Patient.ID)

# Task "add an underscore to the sample names": "Sample3" becomes "Sample_3".
# Anchoring with ^ and using sub() limits the change to the single leading
# "Sample" prefix of each row name.
rownames(dataset) <- sub("^Sample", "Sample_", rownames(dataset))