Repetitive coding tasks

You may often find yourself in a situation where you need to do the same thing (or set of things) in R, over and over again. This is true in my (real-world) example of working with US patent data. This (very clunky) database allows you to download all of the patents granted, one week at a time. What’s more, the data are not very clean, so each csv requires some cleaning. So if we wanted to aggregate clean patent data over multiple weeks, months, or years, we would need to run the same chunk of code (reading in and cleaning the data) over 100 times, each time with only minor alterations. This is the kind of task where we can use function writing and iteration to our advantage.

Note: How should we even be reading in and storing lots of data like this? I store my data on Box and have downloaded the Box app onto my computer so that I have an easy filepath to it. Here are the files I am working with for this example: 104 weeks of US patent data for 1977 and 1978.

# List the names of the files in the data folder
list.files("~/Box/d-rug/data/")

##   [1] "uspto_1977_1.csv"  "uspto_1977_10.csv" "uspto_1977_11.csv"
##   [4] "uspto_1977_12.csv" "uspto_1977_13.csv" "uspto_1977_14.csv"
##   [7] "uspto_1977_15.csv" "uspto_1977_16.csv" "uspto_1977_17.csv"
##  [10] "uspto_1977_18.csv" "uspto_1977_19.csv" "uspto_1977_2.csv" 
##  [13] "uspto_1977_20.csv" "uspto_1977_21.csv" "uspto_1977_22.csv"
##  [16] "uspto_1977_23.csv" "uspto_1977_24.csv" "uspto_1977_25.csv"
##  [19] "uspto_1977_26.csv" "uspto_1977_27.csv" "uspto_1977_28.csv"
##  [22] "uspto_1977_29.csv" "uspto_1977_3.csv"  "uspto_1977_30.csv"
##  [25] "uspto_1977_31.csv" "uspto_1977_32.csv" "uspto_1977_33.csv"
##  [28] "uspto_1977_34.csv" "uspto_1977_35.csv" "uspto_1977_36.csv"
##  [31] "uspto_1977_37.csv" "uspto_1977_38.csv" "uspto_1977_39.csv"
##  [34] "uspto_1977_4.csv"  "uspto_1977_40.csv" "uspto_1977_41.csv"
##  [37] "uspto_1977_42.csv" "uspto_1977_43.csv" "uspto_1977_44.csv"
##  [40] "uspto_1977_45.csv" "uspto_1977_46.csv" "uspto_1977_47.csv"
##  [43] "uspto_1977_48.csv" "uspto_1977_49.csv" "uspto_1977_5.csv" 
##  [46] "uspto_1977_50.csv" "uspto_1977_51.csv" "uspto_1977_52.csv"
##  [49] "uspto_1977_6.csv"  "uspto_1977_7.csv"  "uspto_1977_8.csv" 
##  [52] "uspto_1977_9.csv"  "uspto_1978_1.csv"  "uspto_1978_10.csv"
##  [55] "uspto_1978_11.csv" "uspto_1978_12.csv" "uspto_1978_13.csv"
##  [58] "uspto_1978_14.csv" "uspto_1978_15.csv" "uspto_1978_16.csv"
##  [61] "uspto_1978_17.csv" "uspto_1978_18.csv" "uspto_1978_19.csv"
##  [64] "uspto_1978_2.csv"  "uspto_1978_20.csv" "uspto_1978_21.csv"
##  [67] "uspto_1978_22.csv" "uspto_1978_23.csv" "uspto_1978_24.csv"
##  [70] "uspto_1978_25.csv" "uspto_1978_26.csv" "uspto_1978_27.csv"
##  [73] "uspto_1978_28.csv" "uspto_1978_29.csv" "uspto_1978_3.csv" 
##  [76] "uspto_1978_30.csv" "uspto_1978_31.csv" "uspto_1978_32.csv"
##  [79] "uspto_1978_33.csv" "uspto_1978_34.csv" "uspto_1978_35.csv"
##  [82] "uspto_1978_36.csv" "uspto_1978_37.csv" "uspto_1978_38.csv"
##  [85] "uspto_1978_39.csv" "uspto_1978_4.csv"  "uspto_1978_40.csv"
##  [88] "uspto_1978_41.csv" "uspto_1978_42.csv" "uspto_1978_43.csv"
##  [91] "uspto_1978_44.csv" "uspto_1978_45.csv" "uspto_1978_46.csv"
##  [94] "uspto_1978_47.csv" "uspto_1978_48.csv" "uspto_1978_49.csv"
##  [97] "uspto_1978_5.csv"  "uspto_1978_50.csv" "uspto_1978_51.csv"
## [100] "uspto_1978_52.csv" "uspto_1978_6.csv"  "uspto_1978_7.csv" 
## [103] "uspto_1978_8.csv"  "uspto_1978_9.csv"

What would this look like if I were to go through each file one by one…

# Load lubridate first: it provides ymd(), used below on the date columns
library(lubridate)
# Week 1 of 1977: read the csv, then convert App_Date and Issue_Date by
# coercing each column to character and passing it to ymd()
patent77_1 <- read.csv("~/Box/d-rug/data/uspto_1977_1.csv")
patent77_1$App_Date <- ymd(as.character(patent77_1$App_Date))
patent77_1$Issue_Date <- ymd(as.character(patent77_1$Issue_Date))

# Week 2 of 1977: identical steps; only the file name and object name change
patent77_2 <- read.csv("~/Box/d-rug/data/uspto_1977_2.csv")
patent77_2$App_Date <- ymd(as.character(patent77_2$App_Date))
patent77_2$Issue_Date <- ymd(as.character(patent77_2$Issue_Date))

# Week 3 of 1977: identical steps again
patent77_3 <- read.csv("~/Box/d-rug/data/uspto_1977_3.csv")
patent77_3$App_Date <- ymd(as.character(patent77_3$App_Date))
patent77_3$Issue_Date <- ymd(as.character(patent77_3$Issue_Date))

# Week 4 of 1977: identical steps again
patent77_4 <- read.csv("~/Box/d-rug/data/uspto_1977_4.csv")
patent77_4$App_Date <- ymd(as.character(patent77_4$App_Date))
patent77_4$Issue_Date <- ymd(as.character(patent77_4$Issue_Date))
# ... and so on, one copy of this stanza for every remaining file

Each file will look something like this:

# Print a column-by-column summary of the first week's data frame
summary(patent77_1)

##      WKU               Title              App_Date         
##  Length:1484        Length:1484        Min.   :1960-10-18  
##  Class :character   Class :character   1st Qu.:1974-12-26  
##  Mode  :character   Mode  :character   Median :1975-05-28  
##                                        Mean   :1975-02-22  
##                                        3rd Qu.:1975-09-25  
##                                        Max.   :1976-07-01  
##    Issue_Date           Inventor           Assignee          ICL_Class        
##  Min.   :1977-01-04   Length:1484        Length:1484        Length:1484       
##  1st Qu.:1977-01-04   Class :character   Class :character   Class :character  
##  Median :1977-01-04   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1977-01-04                                                           
##  3rd Qu.:1977-01-04                                                           
##  Max.   :1977-01-04                                                           
##   References           Claims         
##  Length:1484        Length:1484       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##

… Then do that 100 more times. And if you want to edit it, edit it over 100 times, and again and again until you lose your mind. So how can we make this more efficient?