En R existen dos tipos de bases de datos
data.frame()
ytibble()
que son las bases de datos detidyverse
el mejor paquete para manipulacion de datos.
# Un data.frame se convierte facilmente
= as_tibble(df) tib
En esta clase nos vamos a centrar en el uso de
tidyverse
ylubridate
un paquete con funciones para trabajar con fechas. El paquetezoo
tiene algunas funciones adicionales para trabajar con fechas.
Setear directorio de trabajo
getwd() # Para ver en que directorio estamos trabajando
setwd('C:/Documentos/CursoR') # Definir directorio. Notar barras invertidas en la ruta
Coloco rutas de acceso en una variable para usar mas adelante
= 'C:/Users/msang/OneDrive - BCRA/CursoR/2021/Clase01/datos_ts.xlsx'
datos_ts = 'C:/Users/msang/OneDrive - BCRA/CursoR/2021/Clase02/datos_wb.xlsx' datos_pd
Carga de datos
# CSV
= read.csv("b_datos.csv", header=TRUE, stringsAsFactors=TRUE, sep=",")
bd
# TXT / Esta es la mas eficiente porque permite paralelizar con multithread
= data.table::fread('b_datos.txt', header=TRUE, stringsAsFactors=F, sep='\t', nThread=2)
bd
= read.delim('datos/b_datos.txt', header=TRUE, stringsAsFactors=F, sep='\t') bd
# Excel
= readxl::read_excel(datos_pd, sheet='1') # En datos_pd esta la ruta de acceso completa bd
year | cname | ccode | gdp_pc2010 | gdp_pc2017 | gdp_2010 | credit_ps | inv | exports | imports | popu |
---|---|---|---|---|---|---|---|---|---|---|
2,011 | Argentina | ARG | 10,883 | 24,648 | 449,061,807,872 | 14.0 | 17.2 | 18.4 | 16.8 | 41,261,490 |
2,011 | Brazil | BRA | 11,628 | 15,323 | 2,296,661,602,092 | 58.1 | 20.6 | 11.6 | 12.4 | 197,514,541 |
2,011 | Chile | CHL | 13,456 | 22,338 | 231,892,203,586 | 101.3 | 23.1 | 37.8 | 34.4 | 17,233,584 |
2,011 | France | FRA | 41,369 | 42,864 | 2,703,188,933,543 | 96.8 | 22.4 | 28.4 | 30.4 | 65,342,789 |
2,011 | Italy | ITA | 36,228 | 42,892 | 2,151,209,302,546 | 94.1 | 19.7 | 26.9 | 28.3 | 59,379,449 |
2,011 | United Kingdom | GBR | 39,729 | 42,294 | 2,513,237,223,478 | 170.6 | 15.6 | 30.7 | 31.7 | 63,258,810 |
Bases de STATA
library(heaven)
read_dta()
write_dta()
Ingrasar datos con tidyverse
Comando | Separador |
---|---|
read_csv() |
coma |
read_csv2() |
punto y coma |
read_tsv() |
tab |
read_delim() |
otros |
Problemas con la imputacion de datos (parse)
Ver mas especificaciones aqui
# Parsear vectores
library(readr)
str(parse_logical(c("TRUE", "FALSE", "NA")))
## logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
## int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
## Date[1:2], format: "2010-01-01" "1979-10-14"
parse_integer(c("1", "231", ".", "456"), na = ".")
## [1] 1 231 NA 456
parse_double("1,23", locale = locale(decimal_mark = ","))
## [1] 1.23
# Base de datos
<- read_csv(
challenge readr_example("challenge.csv"),
col_types = cols(
x = col_double(),
y = col_date()
)
)tail(challenge)
## # A tibble: 6 x 2
## x y
## <dbl> <date>
## 1 0.805 2019-11-21
## 2 0.164 2018-03-29
## 3 0.472 2014-08-04
## 4 0.718 2015-08-16
## 5 0.270 2020-02-04
## 6 0.608 2019-01-06
Output de datos
# CSV
write.csv(bd,"b_datos.csv")
write_csv()
write_excel_csv()
# TXT
write_delim()
write_tsv()
# Excel
library("xlsx")
# Primera base de datos
write.xlsx(USArrests, file = "b_datos.xlsx", sheetName = "IRIS", append = FALSE)
# Segunda base de datos
write.xlsx(mtcars, file = "b_datos.xlsx", sheetName="MTCARS", append=TRUE)
Variables
library(tidyverse)
= bd %>%
bd1 mutate(gdp_pc2010bis = gdp_2010 / popu, # crear
open = exports + imports,
inv_demean = inv - mean(inv)) %>%
rename(poblacion = popu) %>% # rename
mutate(gdp_2010 = NULL) # drop (tambien con select())
year | cname | gdp_pc2010 | gdp_pc2010bis | open | inv_demean |
---|---|---|---|---|---|
2,011 | Argentina | 10,883 | 10,883 | 35.2 | -2 |
2,011 | Brazil | 11,628 | 11,628 | 23.9 | 2 |
2,011 | Chile | 13,456 | 13,456 | 72.2 | 4 |
2,011 | France | 41,369 | 41,369 | 58.8 | 4 |
2,011 | Italy | 36,228 | 36,228 | 55.1 | 1 |
2,011 | United Kingdom | 39,729 | 39,729 | 62.4 | -3 |
Merge
= readxl::read_excel(datos_pd, sheet='2')
meta = left_join(bd, meta, by=c('ccode')) bd
year | cname | region |
---|---|---|
2011 | Argentina | Latin America & Caribbean |
2011 | Brazil | Latin America & Caribbean |
2011 | Chile | Latin America & Caribbean |
2011 | France | Europe & Central Asia |
2011 | Italy | Europe & Central Asia |
2011 | United Kingdom | Europe & Central Asia |
group_by , mutate
$id = as.numeric(factor(bd$ccode))
bd# Si quiero usar una funcion propia
= function(x) {x - mean(x, na.rm = TRUE)}
demean = bd %>%
bd mutate(open = exports + imports) %>%
select(ccode, year, region, gdp_pc2017, credit_ps, inv, id, open) %>%
arrange(ccode, year) %>%
group_by(ccode) %>%
mutate(obs = seq(1:length(ccode)), # igual con row_number()
gdp_gr = 100 * (gdp_pc2017 / lag(gdp_pc2017,1) - 1),
credit_ps_mean = mean(credit_ps, na.rm = TRUE),
dev = if_else(region=='Latin America & Caribbean', 0, 1),
gdp_dem = demean(gdp_pc2017)) %>%
ungroup()
head(bd[c('ccode', 'dev', 'year', 'gdp_pc2017', 'gdp_gr', 'gdp_dem')],10)
## # A tibble: 10 x 6
## ccode dev year gdp_pc2017 gdp_gr gdp_dem
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARG 0 2011 24648. NA 1451.
## 2 ARG 0 2012 24119. -2.15 922.
## 3 ARG 0 2013 24424. 1.27 1227.
## 4 ARG 0 2014 23550. -3.58 353.
## 5 ARG 0 2015 23934. 1.63 737.
## 6 ARG 0 2016 23190. -3.11 -7.58
## 7 ARG 0 2017 23597. 1.76 400.
## 8 ARG 0 2018 22759. -3.55 -438.
## 9 ARG 0 2019 22064. -3.06 -1133.
## 10 ARG 0 2020 19687. -10.8 -3511.
# case_when() permite evaluar mas de 2 alternativas
<- tibble(
df a = seq(1,5)
)= df %>% mutate(b = case_when(a <= 2 ~ 1,
df > 2 & a <= 4 ~ 2,
a TRUE ~ as.double(a)))
df
## # A tibble: 5 x 2
## a b
## <int> <dbl>
## 1 1 1
## 2 2 1
## 3 3 2
## 4 4 2
## 5 5 5
Guardar una base de datos en R
= bd %>% select(ccode, year, region, gdp_gr, credit_ps, inv, open)
bd save(bd, file="datos_wb.rda")
ccode | year | region | gdp_gr | credit_ps | inv | open |
---|---|---|---|---|---|---|
ARG | 2018 | Latin America & Caribbean | -3.6 | NA | 14.7 | 31.2 |
ARG | 2019 | Latin America & Caribbean | -3.1 | NA | 13.5 | 32.6 |
ARG | 2020 | Latin America & Caribbean | -10.8 | NA | 13.4 | 30.5 |
BRA | 2018 | Latin America & Caribbean | 1.0 | 60.2 | 15.1 | 28.9 |
BRA | 2019 | Latin America & Caribbean | 0.7 | 62.6 | 15.3 | 28.5 |
BRA | 2020 | Latin America & Caribbean | -4.7 | 70.2 | 16.4 | 32.4 |
GBR | 2018 | Europe & Central Asia | 0.6 | 134.6 | 17.8 | 63.0 |
GBR | 2019 | Europe & Central Asia | 0.8 | 133.5 | 18.0 | 63.4 |
GBR | 2020 | Europe & Central Asia | -10.3 | 146.4 | 17.6 | 55.1 |
ITA | 2018 | Europe & Central Asia | 1.1 | 76.7 | 17.8 | 60.3 |
ITA | 2019 | Europe & Central Asia | 1.5 | 74.3 | 18.0 | 60.1 |
ITA | 2020 | Europe & Central Asia | -8.6 | 83.6 | 17.8 | 55.3 |
Loop sobre variables
set.seed(1234)
<- tibble(
df a = runif(100, min=0, max=100),
b = rnorm(100, 0, 1),
c = rnorm(100, mean=5, sd=3),
)head(df,3)
## # A tibble: 3 x 3
## a b c
## <dbl> <dbl> <dbl>
## 1 11.4 -1.81 3.87
## 2 62.2 -0.582 5.29
## 3 60.9 -1.11 9.92
# Reemplazar variables existentes
= names(df)
vars for (v in vars) {
= df[v] * 100
df[v]
}
head(df,3)
## # A tibble: 3 x 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1137. -181. 387.
## 2 6223. -58.2 529.
## 3 6093. -111. 992.
# Generar varaibles nuevas
# Dividir las dos ultimas (b y c) por la primera (a)
set.seed(1234)
<- tibble(
df1 a = rep(2, 100),
b = rnorm(100, 0, 1),
c = rnorm(100, 5, 3),
)= names(df1[2:length(df1)])
vars for (v in vars) {
paste0(v,'_a')] = df1[v] / df1[['a']]
df1[
}head(df1,3)
## # A tibble: 3 x 5
## a b c b_a c_a
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 -1.21 6.24 -0.604 3.12
## 2 2 0.277 3.58 0.139 1.79
## 3 2 1.08 5.20 0.542 2.60
Valores missing
# Son tratados como los valores mas grandes de todos pero el replace los respeta
<- tibble(
df2 a = sample(1:5, 5, replace = F),
b = seq(5,1),
) df2
## # A tibble: 5 x 2
## a b
## <int> <int>
## 1 1 5
## 2 5 4
## 3 3 3
## 4 4 2
## 5 2 1
= df2 %>% mutate(b = ifelse(b == 2, NA, b))
df2 = df2 %>% mutate(c = ifelse(b > 3, 0, b))
df2 = df2 %>% arrange(b)
df2 df2
## # A tibble: 5 x 3
## a b c
## <int> <int> <dbl>
## 1 2 1 1
## 2 3 3 3
## 3 5 4 0
## 4 1 5 0
## 5 4 NA NA
Volvemos a la base de WB
# Recordamos la estructura
str(bd)
## tibble [60 x 7] (S3: tbl_df/tbl/data.frame)
## $ ccode : chr [1:60] "ARG" "ARG" "ARG" "ARG" ...
## $ year : num [1:60] 2011 2012 2013 2014 2015 ...
## $ region : chr [1:60] "Latin America & Caribbean" "Latin America & Caribbean" "Latin America & Caribbean" "Latin America & Caribbean" ...
## $ gdp_gr : num [1:60] NA -2.15 1.27 -3.58 1.63 ...
## $ credit_ps: num [1:60] 14 15.2 15.7 13.8 14.4 ...
## $ inv : num [1:60] 17.2 15.9 16.3 16 15.6 ...
## $ open : num [1:60] 35.2 30.5 29.3 28.4 22.5 ...
summary(bd[,1:4])
## ccode year region gdp_gr
## Length:60 Min. :2011 Length:60 Min. :-10.7750
## Class :character 1st Qu.:2013 Class :character 1st Qu.: -2.7656
## Mode :character Median :2016 Mode :character Median : 0.7100
## Mean :2016 Mean : -0.6777
## 3rd Qu.:2018 3rd Qu.: 1.4095
## Max. :2020 Max. : 4.3092
## NA's :6
Eliminar valores missing (elimina la fila completa)
= na.omit(bd)
bd1 nrow(bd1)
## [1] 50
sum(ifelse(is.na(bd$credit_ps),1,0)) # cuenta valores missing de CreditoSPriv
## [1] 4
rm('bd1')
Pivot (reshape)
# Long
= bd %>%
bdAR filter(ccode=='ARG') %>%
select(year, credit_ps, inv) %>%
pivot_longer(cols=-year, names_to="Var", values_to="Val") %>%
arrange(year, desc(Var)) # Sort
year | Var | Val |
---|---|---|
2011 | inv | 17.2 |
2011 | credit_ps | 14.0 |
2012 | inv | 15.9 |
2012 | credit_ps | 15.2 |
2013 | inv | 16.3 |
2013 | credit_ps | 15.7 |
# Wide
= bdAR %>%
bdAR2 filter(Var=='credit_ps', year <=2015) %>%
mutate(ccode = 'ARG') %>%
pivot_wider(id_cols=ccode, names_from=year, values_from=Val) %>%
rename_with(~ paste0("CREDIT", 2011:2015), where(is.numeric))
ccode | CREDIT2011 | CREDIT2012 | CREDIT2013 | CREDIT2014 | CREDIT2015 |
---|---|---|---|---|---|
ARG | 14 | 15.2 | 15.7 | 13.8 | 14.4 |
# Otra forma de llevar a long starts_with()
= bdAR2 %>%
bdAR3 pivot_longer(cols = starts_with("CREDIT"), names_to="Var", values_to="Val") %>%
separate(Var, c("V","year"), sep = 6)
#unite() para concatenar variables
ccode | V | year | Val |
---|---|---|---|
ARG | CREDIT | 2011 | 14.0 |
ARG | CREDIT | 2012 | 15.2 |
ARG | CREDIT | 2013 | 15.7 |
ARG | CREDIT | 2014 | 13.8 |
ARG | CREDIT | 2015 | 14.4 |
Append
= bd %>%
bdAR filter(ccode=="ARG",
>=2018) %>%
yearselect(year, ccode, gdp_gr)
= bd %>%
bdBR filter(ccode=="BRA",
>=2018) %>%
yearselect(year, ccode, gdp_gr)
= rbind(bdAR, bdBR)
bdARBR # Ver bind_rows() de tidyverse
year | ccode | gdp_gr |
---|---|---|
2018 | ARG | -3.6 |
2019 | ARG | -3.1 |
2020 | ARG | -10.8 |
2018 | BRA | 1.0 |
2019 | BRA | 0.7 |
2020 | BRA | -4.7 |
Trabajar con strings
= c('uno','dos', 'tres')
words length(words) # cuenta los elementos del vector
## [1] 3
str_length(words)
## [1] 3 3 4
= gsub('tres','cinco',words)
words words
## [1] "uno" "dos" "cinco"
= str_replace(words, "uno", "diez")
words words
## [1] "diez" "dos" "cinco"
= 'hola'
hi nchar(hi)
## [1] 4
Trabajar con fechas
Year
%Y (4 digitos).
%y (2 digitos); 00-69 -> 2000-2069, 70-99 -> 1970-1999.
Month
%m (2 digitos).
%b (nombre abreviado, “Jan”).
%B (nombre completo, “January”).
Day
%d (2 digitos).
library(zoo)
library(lubridate)
= readxl::read_excel(datos_ts, sheet='datos')
datos $fecha = as.Date(datos$fecha, format = '%Y-%m-%d')
datos$fecha2 = datos$fecha + days(15)
datos$fecha3 = floor_date(datos$fecha2, 'month')
datos$mes = month(datos$fecha)
datos$year = year(datos$fecha)
datos= datos[,c(1,5,6,7,8,2,3,4)] # reordena la base de datos datos
fecha | …5 | fecha2 | fecha3 | mes | ipc | tcn | emae_sa |
---|---|---|---|---|---|---|---|
2015-01-01 | 2015q1 | 2015-01-16 | 2015-01-01 | 1 | 57.6 | 8.6 | 144.2 |
2015-02-01 | 2015q1 | 2015-02-16 | 2015-02-01 | 2 | 58.5 | 8.7 | 148.0 |
2015-03-01 | 2015q1 | 2015-03-16 | 2015-03-01 | 3 | 59.5 | 8.8 | 147.7 |
2015-04-01 | 2015q2 | 2015-04-16 | 2015-04-01 | 4 | 60.9 | 8.9 | 149.8 |
2015-05-01 | 2015q2 | 2015-05-16 | 2015-05-01 | 5 | 62.2 | 9.0 | 149.7 |
2015-06-01 | 2015q2 | 2015-06-16 | 2015-06-01 | 6 | 63.0 | 9.1 | 150.6 |
Datos trimestrales (zoo
)
<- readxl::read_excel(datos_ts, sheet='trim') %>%
datosq mutate(fecha=as.Date(as.yearqtr(paste(year, trim), format="%Y %q")))
year | trim | ipc | tcn | emae_sa | fecha |
---|---|---|---|---|---|
2015 | 1 | 57.6 | 8.6 | 144.2 | 2015-01-01 |
2015 | 2 | 60.9 | 8.9 | 149.8 | 2015-04-01 |
2015 | 3 | 64.3 | 9.2 | 150.8 | 2015-07-01 |
2015 | 4 | 67.7 | 9.5 | 149.4 | 2015-10-01 |
2016 | 1 | 74.7 | 13.9 | 147.8 | 2016-01-01 |
2016 | 2 | 85.5 | 14.3 | 145.0 | 2016-04-01 |
Graficos
plot(datos$fecha, datos$ipc )
plot(datos$fecha, log(datos$ipc), type= 'l', col = 'red', xlab ='Mes', ylab ='Log(IPC)')
GGPlot (Grammar of Graphics)
#SINTAXIS
# ggplot(data = <DATA>) +
# <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>)) # se agregan layers (piont, line, etc.)
# aes() "aesthetic" define la estetica del grafico
library(ggplot2)
= datos %>%
datos1 select(fecha, ipc)
ggplot(datos1, aes(x=fecha, y=ipc)) +
geom_line(color = 'steelblue2', size = 1.2) +
theme_minimal() +
labs(title="Indice de precios al consumidor", x="Mes", y="IPC") +
theme(legend.position="none") +
NULL
Eje secundario
# El eje secundario se utiliza a escala del primario lo que no suele resultar practico
= datos %>%
datos1 select(fecha, ipc, tcn) %>%
rename(IPC = ipc, TCN = tcn) %>%
pivot_longer(cols = -fecha, names_to='Var', values_to = 'Val')
fecha | Var | Val |
---|---|---|
2015-01-01 | IPC | 57.6 |
2015-01-01 | TCN | 8.6 |
2015-02-01 | IPC | 58.5 |
2015-02-01 | TCN | 8.7 |
2015-03-01 | IPC | 59.5 |
2015-03-01 | TCN | 8.8 |
# Los graficos con eje secundario suelen presentarse en 2 graficos separados
= ggplot(datos1) +
g geom_line(aes(x=fecha, y=Val, group=Var, color=Var), show.legend = TRUE) +
theme_minimal() +
theme(axis.text=element_text(size=12), axis.text.x = element_text(size = 10, angle = 90), legend.position="none") +
labs(title="", x="", y="") +
scale_x_date(date_breaks = "6 month", date_labels = "%Y-%m") +
facet_wrap(~Var, scales = "free_y") +
NULL
g
Guardar un grafico en disco
= "C:/Users/msang/OneDrive - BCRA/CursoR/2021/Clase02/"
work = paste0(work,'ts.png')
filesave ggsave(filesave, g, width=10, height=8)