En R existen dos tipos de bases de datos
data.frame()ytibble()que son las bases de datos detidyverseel mejor paquete para manipulacion de datos.
# Un data.frame se convierte facilmente
tib = as_tibble(df)En esta clase nos vamos a centrar en el uso de
tidyverseylubridateun paquete con funciones para trabajar con fechas. El paquetezootiene algunas funciones adicionales para trabajar con fechas.
Setear directorio de trabajo
getwd() # Para ver en que directorio estamos trabajando
setwd('C:/Documentos/CursoR') # Definir directorio. Notar barras invertidas en la rutaColoco rutas de acceso en una variable para usar mas adelante
datos_ts = 'C:/Users/msang/OneDrive - BCRA/CursoR/2021/Clase01/datos_ts.xlsx'
datos_pd = 'C:/Users/msang/OneDrive - BCRA/CursoR/2021/Clase02/datos_wb.xlsx'Carga de datos
# CSV
bd = read.csv("b_datos.csv", header=TRUE, stringsAsFactors=TRUE, sep=",")
# TXT / Esta es la mas eficiente porque permite paralelizar con multithread
bd = data.table::fread('b_datos.txt', header=TRUE, stringsAsFactors=F, sep='\t', nThread=2)
bd = read.delim('datos/b_datos.txt', header=TRUE, stringsAsFactors=F, sep='\t')# Excel
bd = readxl::read_excel(datos_pd, sheet='1') # En datos_pd esta la ruta de acceso completa | year | cname | ccode | gdp_pc2010 | gdp_pc2017 | gdp_2010 | credit_ps | inv | exports | imports | popu | 
|---|---|---|---|---|---|---|---|---|---|---|
| 2,011 | Argentina | ARG | 10,883 | 24,648 | 449,061,807,872 | 14.0 | 17.2 | 18.4 | 16.8 | 41,261,490 | 
| 2,011 | Brazil | BRA | 11,628 | 15,323 | 2,296,661,602,092 | 58.1 | 20.6 | 11.6 | 12.4 | 197,514,541 | 
| 2,011 | Chile | CHL | 13,456 | 22,338 | 231,892,203,586 | 101.3 | 23.1 | 37.8 | 34.4 | 17,233,584 | 
| 2,011 | France | FRA | 41,369 | 42,864 | 2,703,188,933,543 | 96.8 | 22.4 | 28.4 | 30.4 | 65,342,789 | 
| 2,011 | Italy | ITA | 36,228 | 42,892 | 2,151,209,302,546 | 94.1 | 19.7 | 26.9 | 28.3 | 59,379,449 | 
| 2,011 | United Kingdom | GBR | 39,729 | 42,294 | 2,513,237,223,478 | 170.6 | 15.6 | 30.7 | 31.7 | 63,258,810 | 
Bases de STATA
library(heaven)
read_dta()
write_dta()Ingrasar datos con tidyverse
| Comando | Separador | 
|---|---|
read_csv() | 
coma | 
read_csv2() | 
punto y coma | 
read_tsv() | 
tab | 
read_delim() | 
otros | 
Problemas con la imputacion de datos (parse)
Ver mas especificaciones aqui
# Parsear vectores
library(readr)
str(parse_logical(c("TRUE", "FALSE", "NA")))##  logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))##  int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))##  Date[1:2], format: "2010-01-01" "1979-10-14"
parse_integer(c("1", "231", ".", "456"), na = ".")## [1]   1 231  NA 456
parse_double("1,23", locale = locale(decimal_mark = ","))## [1] 1.23
# Base de datos
challenge <- read_csv(
  readr_example("challenge.csv"), 
  col_types = cols(
    x = col_double(),
    y = col_date()
  )
)
tail(challenge)## # A tibble: 6 x 2
##       x y         
##   <dbl> <date>    
## 1 0.805 2019-11-21
## 2 0.164 2018-03-29
## 3 0.472 2014-08-04
## 4 0.718 2015-08-16
## 5 0.270 2020-02-04
## 6 0.608 2019-01-06
Output de datos
# CSV
write.csv(bd,"b_datos.csv")
write_csv()
write_excel_csv()
# TXT 
write_delim()
write_tsv()
# Excel
library("xlsx")
# Primera base de datos
write.xlsx(USArrests, file = "b_datos.xlsx", sheetName = "IRIS", append = FALSE)
# Segunda base de datos
write.xlsx(mtcars, file = "b_datos.xlsx", sheetName="MTCARS", append=TRUE)Variables
library(tidyverse)
bd1 = bd %>% 
  mutate(gdp_pc2010bis = gdp_2010 / popu,   # crear
         open = exports + imports,
         inv_demean = inv - mean(inv)) %>%
  rename(poblacion = popu) %>%              # rename
  mutate(gdp_2010 = NULL)                   # drop (tambien con select())| year | cname | gdp_pc2010 | gdp_pc2010bis | open | inv_demean | 
|---|---|---|---|---|---|
| 2,011 | Argentina | 10,883 | 10,883 | 35.2 | -2 | 
| 2,011 | Brazil | 11,628 | 11,628 | 23.9 | 2 | 
| 2,011 | Chile | 13,456 | 13,456 | 72.2 | 4 | 
| 2,011 | France | 41,369 | 41,369 | 58.8 | 4 | 
| 2,011 | Italy | 36,228 | 36,228 | 55.1 | 1 | 
| 2,011 | United Kingdom | 39,729 | 39,729 | 62.4 | -3 | 
Merge
meta = readxl::read_excel(datos_pd, sheet='2')
bd = left_join(bd, meta, by=c('ccode'))| year | cname | region | 
|---|---|---|
| 2011 | Argentina | Latin America & Caribbean | 
| 2011 | Brazil | Latin America & Caribbean | 
| 2011 | Chile | Latin America & Caribbean | 
| 2011 | France | Europe & Central Asia | 
| 2011 | Italy | Europe & Central Asia | 
| 2011 | United Kingdom | Europe & Central Asia | 
group_by , mutate
bd$id = as.numeric(factor(bd$ccode))
# Si quiero usar una funcion propia
demean = function(x) {x - mean(x, na.rm = TRUE)}
bd = bd %>% 
  mutate(open = exports + imports) %>%
  select(ccode, year, region, gdp_pc2017, credit_ps, inv, id, open) %>%
    arrange(ccode, year) %>%
  group_by(ccode) %>%
  mutate(obs = seq(1:length(ccode)),     # igual con row_number()
         gdp_gr = 100 * (gdp_pc2017 / lag(gdp_pc2017,1) - 1),
         credit_ps_mean = mean(credit_ps, na.rm = TRUE),
         dev = if_else(region=='Latin America & Caribbean', 0, 1),
         gdp_dem = demean(gdp_pc2017)) %>%
  ungroup()
head(bd[c('ccode', 'dev', 'year', 'gdp_pc2017', 'gdp_gr', 'gdp_dem')],10)## # A tibble: 10 x 6
##    ccode   dev  year gdp_pc2017 gdp_gr  gdp_dem
##    <chr> <dbl> <dbl>      <dbl>  <dbl>    <dbl>
##  1 ARG       0  2011     24648.  NA     1451.  
##  2 ARG       0  2012     24119.  -2.15   922.  
##  3 ARG       0  2013     24424.   1.27  1227.  
##  4 ARG       0  2014     23550.  -3.58   353.  
##  5 ARG       0  2015     23934.   1.63   737.  
##  6 ARG       0  2016     23190.  -3.11    -7.58
##  7 ARG       0  2017     23597.   1.76   400.  
##  8 ARG       0  2018     22759.  -3.55  -438.  
##  9 ARG       0  2019     22064.  -3.06 -1133.  
## 10 ARG       0  2020     19687. -10.8  -3511.
# case_when() permite evaluar mas de 2 alternativas
df <- tibble(
  a = seq(1,5)
)
df = df %>% mutate(b = case_when(a <= 2 ~ 1,
                                 a > 2 & a <= 4 ~ 2,
                                 TRUE ~ as.double(a)))
df## # A tibble: 5 x 2
##       a     b
##   <int> <dbl>
## 1     1     1
## 2     2     1
## 3     3     2
## 4     4     2
## 5     5     5
Guardar una base de datos en R
bd = bd %>% select(ccode, year, region, gdp_gr, credit_ps, inv, open)
save(bd, file="datos_wb.rda")| ccode | year | region | gdp_gr | credit_ps | inv | open | 
|---|---|---|---|---|---|---|
| ARG | 2018 | Latin America & Caribbean | -3.6 | NA | 14.7 | 31.2 | 
| ARG | 2019 | Latin America & Caribbean | -3.1 | NA | 13.5 | 32.6 | 
| ARG | 2020 | Latin America & Caribbean | -10.8 | NA | 13.4 | 30.5 | 
| BRA | 2018 | Latin America & Caribbean | 1.0 | 60.2 | 15.1 | 28.9 | 
| BRA | 2019 | Latin America & Caribbean | 0.7 | 62.6 | 15.3 | 28.5 | 
| BRA | 2020 | Latin America & Caribbean | -4.7 | 70.2 | 16.4 | 32.4 | 
| GBR | 2018 | Europe & Central Asia | 0.6 | 134.6 | 17.8 | 63.0 | 
| GBR | 2019 | Europe & Central Asia | 0.8 | 133.5 | 18.0 | 63.4 | 
| GBR | 2020 | Europe & Central Asia | -10.3 | 146.4 | 17.6 | 55.1 | 
| ITA | 2018 | Europe & Central Asia | 1.1 | 76.7 | 17.8 | 60.3 | 
| ITA | 2019 | Europe & Central Asia | 1.5 | 74.3 | 18.0 | 60.1 | 
| ITA | 2020 | Europe & Central Asia | -8.6 | 83.6 | 17.8 | 55.3 | 
Loop sobre variables
set.seed(1234)
df <- tibble(
  a = runif(100, min=0, max=100),
  b = rnorm(100, 0, 1),
  c = rnorm(100, mean=5, sd=3),
)
head(df,3)## # A tibble: 3 x 3
##       a      b     c
##   <dbl>  <dbl> <dbl>
## 1  11.4 -1.81   3.87
## 2  62.2 -0.582  5.29
## 3  60.9 -1.11   9.92
# Reemplazar variables existentes
vars = names(df)
for (v in vars) {
  df[v] = df[v] * 100
}
head(df,3)## # A tibble: 3 x 3
##       a      b     c
##   <dbl>  <dbl> <dbl>
## 1 1137. -181.   387.
## 2 6223.  -58.2  529.
## 3 6093. -111.   992.
# Generar varaibles nuevas
# Dividir las dos ultimas (b y c) por la primera (a)
set.seed(1234)
df1 <- tibble(
  a = rep(2, 100),
  b = rnorm(100, 0, 1),
  c = rnorm(100, 5, 3),
)
vars = names(df1[2:length(df1)])
for (v in vars) {
  df1[paste0(v,'_a')] = df1[v] / df1[['a']]
}
head(df1,3)## # A tibble: 3 x 5
##       a      b     c    b_a   c_a
##   <dbl>  <dbl> <dbl>  <dbl> <dbl>
## 1     2 -1.21   6.24 -0.604  3.12
## 2     2  0.277  3.58  0.139  1.79
## 3     2  1.08   5.20  0.542  2.60
Valores missing
# Son tratados como los valores mas grandes de todos pero el replace los respeta
df2 <- tibble(
  a = sample(1:5, 5, replace = F),
  b = seq(5,1),
)
df2## # A tibble: 5 x 2
##       a     b
##   <int> <int>
## 1     1     5
## 2     5     4
## 3     3     3
## 4     4     2
## 5     2     1
df2 = df2 %>% mutate(b = ifelse(b == 2, NA, b)) 
df2 = df2 %>% mutate(c = ifelse(b > 3, 0, b))
df2 = df2 %>% arrange(b)
df2## # A tibble: 5 x 3
##       a     b     c
##   <int> <int> <dbl>
## 1     2     1     1
## 2     3     3     3
## 3     5     4     0
## 4     1     5     0
## 5     4    NA    NA
Volvemos a la base de WB
# Recordamos la estructura
str(bd)## tibble [60 x 7] (S3: tbl_df/tbl/data.frame)
##  $ ccode    : chr [1:60] "ARG" "ARG" "ARG" "ARG" ...
##  $ year     : num [1:60] 2011 2012 2013 2014 2015 ...
##  $ region   : chr [1:60] "Latin America & Caribbean" "Latin America & Caribbean" "Latin America & Caribbean" "Latin America & Caribbean" ...
##  $ gdp_gr   : num [1:60] NA -2.15 1.27 -3.58 1.63 ...
##  $ credit_ps: num [1:60] 14 15.2 15.7 13.8 14.4 ...
##  $ inv      : num [1:60] 17.2 15.9 16.3 16 15.6 ...
##  $ open     : num [1:60] 35.2 30.5 29.3 28.4 22.5 ...
summary(bd[,1:4])##     ccode                year         region              gdp_gr        
##  Length:60          Min.   :2011   Length:60          Min.   :-10.7750  
##  Class :character   1st Qu.:2013   Class :character   1st Qu.: -2.7656  
##  Mode  :character   Median :2016   Mode  :character   Median :  0.7100  
##                     Mean   :2016                      Mean   : -0.6777  
##                     3rd Qu.:2018                      3rd Qu.:  1.4095  
##                     Max.   :2020                      Max.   :  4.3092  
##                                                       NA's   :6
Eliminar valores missing (elimina la fila completa)
bd1 = na.omit(bd)
nrow(bd1)## [1] 50
sum(ifelse(is.na(bd$credit_ps),1,0)) # cuenta valores missing de CreditoSPriv## [1] 4
rm('bd1')Pivot (reshape)
# Long
bdAR = bd %>% 
         filter(ccode=='ARG') %>%
         select(year, credit_ps, inv) %>%
         pivot_longer(cols=-year, names_to="Var", values_to="Val") %>%
         arrange(year, desc(Var))    # Sort| year | Var | Val | 
|---|---|---|
| 2011 | inv | 17.2 | 
| 2011 | credit_ps | 14.0 | 
| 2012 | inv | 15.9 | 
| 2012 | credit_ps | 15.2 | 
| 2013 | inv | 16.3 | 
| 2013 | credit_ps | 15.7 | 
# Wide
bdAR2 = bdAR %>% 
         filter(Var=='credit_ps', year <=2015) %>%
         mutate(ccode = 'ARG') %>%
         pivot_wider(id_cols=ccode, names_from=year, values_from=Val) %>%
         rename_with(~ paste0("CREDIT", 2011:2015), where(is.numeric))| ccode | CREDIT2011 | CREDIT2012 | CREDIT2013 | CREDIT2014 | CREDIT2015 | 
|---|---|---|---|---|---|
| ARG | 14 | 15.2 | 15.7 | 13.8 | 14.4 | 
# Otra forma de llevar a long starts_with()
bdAR3 = bdAR2 %>% 
       pivot_longer(cols = starts_with("CREDIT"), names_to="Var", values_to="Val") %>% 
       separate(Var, c("V","year"), sep = 6)
#unite() para concatenar variables| ccode | V | year | Val | 
|---|---|---|---|
| ARG | CREDIT | 2011 | 14.0 | 
| ARG | CREDIT | 2012 | 15.2 | 
| ARG | CREDIT | 2013 | 15.7 | 
| ARG | CREDIT | 2014 | 13.8 | 
| ARG | CREDIT | 2015 | 14.4 | 
Append
bdAR = bd %>%
  filter(ccode=="ARG",
         year>=2018) %>%
  select(year, ccode, gdp_gr)
bdBR = bd %>%
  filter(ccode=="BRA",
         year>=2018) %>%
  select(year, ccode, gdp_gr)
bdARBR = rbind(bdAR, bdBR)
# Ver bind_rows() de tidyverse| year | ccode | gdp_gr | 
|---|---|---|
| 2018 | ARG | -3.6 | 
| 2019 | ARG | -3.1 | 
| 2020 | ARG | -10.8 | 
| 2018 | BRA | 1.0 | 
| 2019 | BRA | 0.7 | 
| 2020 | BRA | -4.7 | 
Trabajar con strings
words = c('uno','dos', 'tres')
length(words)   # cuenta los elementos del vector## [1] 3
str_length(words)## [1] 3 3 4
words = gsub('tres','cinco',words)
words## [1] "uno"   "dos"   "cinco"
words = str_replace(words, "uno", "diez")
words## [1] "diez"  "dos"   "cinco"
hi = 'hola'
nchar(hi)## [1] 4
Trabajar con fechas
Year
%Y (4 digitos).
%y (2 digitos); 00-69 -> 2000-2069, 70-99 -> 1970-1999.
Month
%m (2 digitos).
%b (nombre abreviado, “Jan”).
%B (nombre completo, “January”).
Day
%d (2 digitos).
library(zoo)
library(lubridate)
datos = readxl::read_excel(datos_ts, sheet='datos')
datos$fecha = as.Date(datos$fecha, format = '%Y-%m-%d')
datos$fecha2 = datos$fecha + days(15)
datos$fecha3 = floor_date(datos$fecha2, 'month')
datos$mes = month(datos$fecha)
datos$year = year(datos$fecha)
datos = datos[,c(1,5,6,7,8,2,3,4)] # reordena la base de datos| fecha | …5 | fecha2 | fecha3 | mes | ipc | tcn | emae_sa | 
|---|---|---|---|---|---|---|---|
| 2015-01-01 | 2015q1 | 2015-01-16 | 2015-01-01 | 1 | 57.6 | 8.6 | 144.2 | 
| 2015-02-01 | 2015q1 | 2015-02-16 | 2015-02-01 | 2 | 58.5 | 8.7 | 148.0 | 
| 2015-03-01 | 2015q1 | 2015-03-16 | 2015-03-01 | 3 | 59.5 | 8.8 | 147.7 | 
| 2015-04-01 | 2015q2 | 2015-04-16 | 2015-04-01 | 4 | 60.9 | 8.9 | 149.8 | 
| 2015-05-01 | 2015q2 | 2015-05-16 | 2015-05-01 | 5 | 62.2 | 9.0 | 149.7 | 
| 2015-06-01 | 2015q2 | 2015-06-16 | 2015-06-01 | 6 | 63.0 | 9.1 | 150.6 | 
Datos trimestrales (zoo)
datosq <- readxl::read_excel(datos_ts, sheet='trim')  %>%
  mutate(fecha=as.Date(as.yearqtr(paste(year, trim), format="%Y %q")))| year | trim | ipc | tcn | emae_sa | fecha | 
|---|---|---|---|---|---|
| 2015 | 1 | 57.6 | 8.6 | 144.2 | 2015-01-01 | 
| 2015 | 2 | 60.9 | 8.9 | 149.8 | 2015-04-01 | 
| 2015 | 3 | 64.3 | 9.2 | 150.8 | 2015-07-01 | 
| 2015 | 4 | 67.7 | 9.5 | 149.4 | 2015-10-01 | 
| 2016 | 1 | 74.7 | 13.9 | 147.8 | 2016-01-01 | 
| 2016 | 2 | 85.5 | 14.3 | 145.0 | 2016-04-01 | 
Graficos
plot(datos$fecha, datos$ipc )plot(datos$fecha, log(datos$ipc), type= 'l', col = 'red', xlab ='Mes', ylab ='Log(IPC)')GGPlot (Grammar of Graphics)
#SINTAXIS
# ggplot(data = <DATA>) + 
#   <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>)) # se agregan layers (piont, line, etc.)
# aes() "aesthetic" define la estetica del grafico
library(ggplot2)
datos1 = datos %>% 
     select(fecha, ipc) 
ggplot(datos1, aes(x=fecha, y=ipc)) +
  geom_line(color = 'steelblue2', size = 1.2) +
  theme_minimal() +
  labs(title="Indice de precios al consumidor", x="Mes", y="IPC") +
  theme(legend.position="none") +
  NULLEje secundario
# El eje secundario se utiliza a escala del primario lo que no suele resultar practico
datos1 = datos %>% 
     select(fecha, ipc, tcn) %>%
     rename(IPC = ipc, TCN = tcn) %>%
     pivot_longer(cols = -fecha, names_to='Var', values_to = 'Val')| fecha | Var | Val | 
|---|---|---|
| 2015-01-01 | IPC | 57.6 | 
| 2015-01-01 | TCN | 8.6 | 
| 2015-02-01 | IPC | 58.5 | 
| 2015-02-01 | TCN | 8.7 | 
| 2015-03-01 | IPC | 59.5 | 
| 2015-03-01 | TCN | 8.8 | 
# Los graficos con eje secundario suelen presentarse en 2 graficos separados
g = ggplot(datos1) +
  geom_line(aes(x=fecha, y=Val, group=Var, color=Var), show.legend = TRUE) +
  theme_minimal() +
  theme(axis.text=element_text(size=12), axis.text.x = element_text(size = 10, angle = 90), legend.position="none") +
  labs(title="", x="", y="") +
  scale_x_date(date_breaks = "6 month", date_labels = "%Y-%m") +
  facet_wrap(~Var, scales = "free_y") +
  NULL
gGuardar un grafico en disco
work = "C:/Users/msang/OneDrive - BCRA/CursoR/2021/Clase02/"
filesave = paste0(work,'ts.png')
ggsave(filesave, g, width=10, height=8)