With reference to Challenge 3 of VAST Challenge 2022, I aim to reveal the economic health of the city of Engagement, Ohio, USA, by using appropriate static and interactive statistical graphics methods. How does the financial health of the residents change over the period covered by the dataset? How do wages compare to the overall cost of living in Engagement? Are there groups that appear to exhibit similar patterns?
Load the required libraries: tidyverse (readr, dplyr, ggplot2), zoo, lubridate, plotly, ggdist and ggiraph.
To understand the financial health of the participants, we mainly refer to the financial journal table and plot the participants’ savings and spending in each month.
# Read the financial journal (one row per transaction) into a tibble.
spend <- read_csv("data/FinancialJournal.csv")
## Rows: 1856330 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): category
## dbl (2): participantId, amount
## dttm (1): timestamp
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Derive calendar fields from each transaction's timestamp: a year-month
# label (`date`) plus the numeric year (`year`) and month (`mon`).
spend <- spend %>%
  mutate(
    date = as.yearmon(timestamp, "%Y %m"),
    year = year(timestamp),
    mon = month(timestamp)
  )
# Look at the last rows to check the new columns.
tail(spend)
## # A tibble: 6 x 7
## participantId timestamp category amount date year mon
## <dbl> <dttm> <chr> <dbl> <yearmon> <int> <int>
## 1 782 2023-05-25 00:05:00 Recreation -22.6 May 2023 2023 5
## 2 39 2023-05-25 00:05:00 Recreation -2.76 May 2023 2023 5
## 3 28 2023-05-25 00:05:00 Recreation -29.5 May 2023 2023 5
## 4 370 2023-05-25 00:05:00 Recreation -28.4 May 2023 2023 5
## 5 537 2023-05-25 00:05:00 Food -4 May 2023 2023 5
## 6 362 2023-05-25 00:05:00 Food -4 May 2023 2023 5
# Total amount per category and month across all participants.
spend_cat <- spend %>%
  group_by(category, date) %>%
  summarise(total = sum(amount))
## `summarise()` has grouped output by 'category'. You can override using the
## `.groups` argument.
spend_cat
## # A tibble: 77 x 3
## # Groups: category [6]
## category date total
## <chr> <yearmon> <dbl>
## 1 Education Mar 2022 -28708.
## 2 Education Apr 2022 -11423.
## 3 Education May 2022 -11423.
## 4 Education Jun 2022 -11423.
## 5 Education Jul 2022 -11423.
## 6 Education Aug 2022 -11423.
## 7 Education Sep 2022 -11423.
## 8 Education Oct 2022 -11423.
## 9 Education Nov 2022 -11423.
## 10 Education Dec 2022 -11423.
## # ... with 67 more rows
# Monthly totals for every category, drawn as one line per category and
# made interactive with plotly.
p <- ggplot(spend_cat, aes(x = date, y = total, color = category)) +
  geom_line()
ggplotly(p)

# The same monthly series restricted to the Wage category.
wage <- spend_cat %>%
  filter(category == "Wage")
ggplot(wage, aes(x = date, y = total)) +
  geom_line()
To inspect the spending pattern more clearly, we should look at each kind of spending relative to the total income of each participant.
# Monthly income per participant: the sum of the Wage and RentAdjustment
# entries within each participant-month.
income_mon <- spend %>%
  filter(category %in% c("Wage", "RentAdjustment")) %>%
  group_by(participantId, date) %>%
  summarize(income = sum(amount))
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Dot plot of the monthly income values.
ggplot(data = income_mon, aes(x = date, y = income)) +
  geom_dots()
# Similarly, keep only the four expense categories and build one table per
# category holding each participant's monthly total; the tables are merged
# afterwards.
spend_mon <- spend %>%
  filter(category %in% c("Education", "Food", "Shelter", "Recreation"))

# Education spending per participant and month.
spend_edu <- spend_mon %>%
  group_by(participantId, date) %>%
  filter(category == "Education") %>%
  summarize(edu = sum(amount))
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Food spending per participant and month.
spend_food <- spend_mon %>%
  group_by(participantId, date) %>%
  filter(category == "Food") %>%
  summarize(food = sum(amount))
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Recreation spending per participant and month.
spend_rec <- spend_mon %>%
  group_by(participantId, date) %>%
  filter(category == "Recreation") %>%
  summarize(rec = sum(amount))
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Shelter spending per participant and month.
spend_shel <- spend_mon %>%
  group_by(participantId, date) %>%
  filter(category == "Shelter") %>%
  summarize(shel = sum(amount))
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Combine monthly income with the four spending categories, one row per
# participant-month that has income. Left joins are used instead of inner
# joins so that a participant-month is kept even when it has no entry in one
# of the categories (for example, no Education expenses); an inner join
# would silently drop all such rows. Missing category amounts are set to 0.
# NOTE(review): participant-months with spending but no income are still
# excluded because income_mon is the left-hand table -- confirm that is
# intended.
spend_mon <- income_mon %>%
  left_join(spend_edu, by = c("participantId", "date")) %>%
  left_join(spend_food, by = c("participantId", "date")) %>%
  left_join(spend_rec, by = c("participantId", "date")) %>%
  left_join(spend_shel, by = c("participantId", "date")) %>%
  mutate(across(c(edu, food, rec, shel), ~ coalesce(.x, 0)))
Plot the total net earnings (income minus expenses) of each participant over the period.
# Print the journal again as a reminder of its columns before aggregating.
spend
## # A tibble: 1,856,330 x 7
## participantId timestamp category amount date year mon
## <dbl> <dttm> <chr> <dbl> <yearmon> <int> <int>
## 1 0 2022-03-01 00:00:00 Wage 2473. Mar 2022 2022 3
## 2 0 2022-03-01 00:00:00 Shelter -555. Mar 2022 2022 3
## 3 0 2022-03-01 00:00:00 Education -38.0 Mar 2022 2022 3
## 4 1 2022-03-01 00:00:00 Wage 2047. Mar 2022 2022 3
## 5 1 2022-03-01 00:00:00 Shelter -555. Mar 2022 2022 3
## 6 1 2022-03-01 00:00:00 Education -38.0 Mar 2022 2022 3
## 7 2 2022-03-01 00:00:00 Wage 2437. Mar 2022 2022 3
## 8 2 2022-03-01 00:00:00 Shelter -557. Mar 2022 2022 3
## 9 2 2022-03-01 00:00:00 Education -12.8 Mar 2022 2022 3
## 10 3 2022-03-01 00:00:00 Wage 2367. Mar 2022 2022 3
## # ... with 1,856,320 more rows
# Sum every journal amount per participant. Wages are recorded as positive
# amounts and expenses as negative ones, so `earn` is the net total over the
# whole period rather than gross income.
spend_per <- spend %>%
  group_by(participantId) %>%
  summarise(earn = sum(amount))
# Histogram of the per-participant totals.
spend_per %>%
  ggplot() +
  geom_histogram_interactive(aes(x = earn))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
spend_per
## # A tibble: 1,011 x 2
## participantId earn
## <dbl> <dbl>
## 1 0 116678.
## 2 1 97282.
## 3 2 86065.
## 4 3 81617.
## 5 4 101944.
## 6 5 7101.
## 7 6 5098.
## 8 7 58455.
## 9 8 21402.
## 10 9 167309.
## # ... with 1,001 more rows
# Text shown when hovering over a dot: the participant id and their total.
spend_per$tooltip <- paste0(
  "ID = ", spend_per$participantId,
  ", earn = ", spend_per$earn
)

# Interactive dot plot of the per-participant totals; the y axis carries no
# information for a dot plot, so its title and breaks are removed.
p <- ggplot(data = spend_per, aes(x = earn)) +
  geom_dotplot_interactive(
    # The aesthetic must name the `tooltip` column created above. The earlier
    # spelling `spend_per$toolip` referred to a column that does not exist,
    # which raised "Unknown or uninitialised column" warnings and left the
    # tooltip empty.
    aes(tooltip = tooltip),
    stackgroups = TRUE,
    binwidth = 1500,
    method = "histodot"
  ) +
  scale_y_continuous(NULL, breaks = NULL)

# Render the plot as an interactive SVG widget (golden-ratio aspect).
girafe(
  ggobj = p,
  width_svg = 10,
  height_svg = 10 * 0.618
)
## Warning: Unknown or uninitialised column: `toolip`.
## Unknown or uninitialised column: `toolip`.
## Warning: Use of `spend_per$toolip` is discouraged. Use `toolip` instead.
I encountered an issue: my girafe plot does not show the tooltip when hovering over a point.
Plot how many jobs each employer offers, as a proxy for how many people each employer hires during the period.
# Read the jobs table (one row per job) into a tibble.
job <- read_csv("data/jobs.csv")
## Rows: 1328 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): daysToWork, educationRequirement
## dbl (3): jobId, employerId, hourlyRate
## time (2): startTime, endTime
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the jobs table.
head(job)
## # A tibble: 6 x 7
## jobId employerId hourlyRate startTime endTime daysToWork educationRequir~
## <dbl> <dbl> <dbl> <time> <time> <chr> <chr>
## 1 0 379 10 07:46 15:46 [Monday,Tuesda~ HighSchoolOrCol~
## 2 1 379 22.2 07:31 15:31 [Monday,Tuesda~ Bachelors
## 3 2 380 10 08:00 16:00 [Monday,Tuesda~ Bachelors
## 4 3 380 15.3 07:39 15:39 [Monday,Tuesda~ Bachelors
## 5 4 381 21.4 07:53 15:53 [Monday,Tuesda~ HighSchoolOrCol~
## 6 5 381 12.1 08:13 16:13 [Monday,Sunday~ HighSchoolOrCol~
# Largest job id in the table.
max(job$jobId)
## [1] 1327
# jobId starts at 0, so a maximum id of 1327 means 1328 jobs in total
# Bar chart of how many employers list a given number of jobs.
# The job count is turned into a factor rather than a character string so the
# x axis stays discrete but its categories follow numeric order (a character
# axis would place "10" before "2").
p1 <- job %>%
  group_by(employerId) %>%
  summarise(njob = n()) %>%
  mutate(njob = factor(njob)) %>%
  ggplot(aes(x = njob)) +
  geom_bar() +
  ggtitle('Number of jobs posted by employers from 2021 to 2022') +
  labs(y = 'count of employers', x = 'number of jobs')
ggplotly(p1)