Introduction

With reference to Challenge 3 of the VAST Challenge 2022, I aim to reveal the economic conditions of the city of Engagement, Ohio, USA, using appropriate static and interactive statistical graphics methods. How does the financial health of the residents change over the period covered by the dataset? How do wages compare to the overall cost of living in Engagement? Are there groups that appear to exhibit similar patterns?

 Load the required libraries

 To understand the financial health of the participants, we mainly refer to the financial journal table and plot the participants’ savings and spending in each month.

# Read the financial journal: one row per participant transaction.
spend <- read_csv("data/FinancialJournal.csv")
## Rows: 1856330 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (1): category
## dbl  (2): participantId, amount
## dttm (1): timestamp
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Derive calendar fields from each transaction's timestamp.
# Columns are referenced by bare name so that mutate() works on the data
# being piped in rather than on whatever `spend` holds in the global
# environment.
spend <- spend %>%
  mutate(
    date = as.yearmon(timestamp, "%Y %m"),
    year = year(timestamp),
    mon = month(timestamp)
  )
tail(spend)
## # A tibble: 6 x 7
##   participantId timestamp           category   amount date       year   mon
##           <dbl> <dttm>              <chr>       <dbl> <yearmon> <int> <int>
## 1           782 2023-05-25 00:05:00 Recreation -22.6  May 2023   2023     5
## 2            39 2023-05-25 00:05:00 Recreation  -2.76 May 2023   2023     5
## 3            28 2023-05-25 00:05:00 Recreation -29.5  May 2023   2023     5
## 4           370 2023-05-25 00:05:00 Recreation -28.4  May 2023   2023     5
## 5           537 2023-05-25 00:05:00 Food        -4    May 2023   2023     5
## 6           362 2023-05-25 00:05:00 Food        -4    May 2023   2023     5
# Total amount per category and month.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# silently applied per category, and the grouping message is not emitted.
spend_cat <- spend %>%
  group_by(category, date) %>%
  summarise(total = sum(amount), .groups = "drop")
## `summarise()` has grouped output by 'category'. You can override using the
## `.groups` argument.
# Print the per-category monthly totals for inspection.
spend_cat
## # A tibble: 77 x 3
## # Groups:   category [6]
##    category  date        total
##    <chr>     <yearmon>   <dbl>
##  1 Education Mar 2022  -28708.
##  2 Education Apr 2022  -11423.
##  3 Education May 2022  -11423.
##  4 Education Jun 2022  -11423.
##  5 Education Jul 2022  -11423.
##  6 Education Aug 2022  -11423.
##  7 Education Sep 2022  -11423.
##  8 Education Oct 2022  -11423.
##  9 Education Nov 2022  -11423.
## 10 Education Dec 2022  -11423.
## # ... with 67 more rows
# Line chart of the monthly total for each category, made interactive
# with plotly.
p <- spend_cat %>%
  ggplot(aes(x = date, y = total, color = category)) +
  geom_line()
ggplotly(p)
Mar 2022May 2022Aug 2022Oct 2022Jan 2023Mar 2023May 20230e+002e+064e+066e+06
categoryEducationFoodRecreationRentAdjustmentShelterWagedatetotal
# Isolate the Wage series and plot it on its own.
wage <- filter(spend_cat, category == "Wage")
ggplot(wage, aes(x = date, y = total)) +
  geom_line()

 To inspect the spending pattern more clearly, we should plot each kind of spending relative to the total income of each participant.

# Total income per participant per month.
# Keep only the categories treated as income here (Wage and RentAdjustment)
# before grouping, then sum them by participant and month.
# `.groups = "drop"` returns an ungrouped tibble.
income_mon <- spend %>%
  filter(category %in% c("Wage", "RentAdjustment")) %>%
  group_by(participantId, date) %>%
  summarise(income = sum(amount), .groups = "drop")
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Dot plot of monthly income, one value per participant and month.
income_mon %>%
  ggplot(aes(x = date, y = income)) +
  geom_dots()

# Similarly, compute the four kinds of spending so they can be merged with
# the income table.
spend_mon <- spend %>%
  filter(category %in% c("Education", "Food", "Shelter", "Recreation"))

# Education spending per participant and month (ungrouped result).
spend_edu <- spend_mon %>%
  filter(category == "Education") %>%
  group_by(participantId, date) %>%
  summarise(edu = sum(amount), .groups = "drop")
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Food spending per participant and month (ungrouped result).
spend_food <- spend_mon %>%
  filter(category == "Food") %>%
  group_by(participantId, date) %>%
  summarise(food = sum(amount), .groups = "drop")
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Recreation spending per participant and month (ungrouped result).
spend_rec <- spend_mon %>%
  filter(category == "Recreation") %>%
  group_by(participantId, date) %>%
  summarise(rec = sum(amount), .groups = "drop")
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Shelter spending per participant and month (ungrouped result).
spend_shel <- spend_mon %>%
  filter(category == "Shelter") %>%
  group_by(participantId, date) %>%
  summarise(shel = sum(amount), .groups = "drop")
## `summarise()` has grouped output by 'participantId'. You can override using the
## `.groups` argument.
# Combine income with the four spending columns, one row per participant and
# month. Left joins keep every income row. The inner joins used before
# dropped any participant-month that lacked a record in even one category
# (for example, participants with no Education spending). A missing category
# is recorded as 0 spending instead.
spend_mon <- income_mon %>%
  left_join(spend_edu, by = c("participantId", "date")) %>%
  left_join(spend_food, by = c("participantId", "date")) %>%
  left_join(spend_rec, by = c("participantId", "date")) %>%
  left_join(spend_shel, by = c("participantId", "date")) %>%
  mutate(across(c(edu, food, rec, shel), ~ coalesce(.x, 0)))

 Plot each participant's total net earnings (the sum of all journal amounts, i.e. income minus spending) over the period.

# Print the journal again, now including the derived date, year and mon
# columns.
spend
## # A tibble: 1,856,330 x 7
##    participantId timestamp           category  amount date       year   mon
##            <dbl> <dttm>              <chr>      <dbl> <yearmon> <int> <int>
##  1             0 2022-03-01 00:00:00 Wage      2473.  Mar 2022   2022     3
##  2             0 2022-03-01 00:00:00 Shelter   -555.  Mar 2022   2022     3
##  3             0 2022-03-01 00:00:00 Education  -38.0 Mar 2022   2022     3
##  4             1 2022-03-01 00:00:00 Wage      2047.  Mar 2022   2022     3
##  5             1 2022-03-01 00:00:00 Shelter   -555.  Mar 2022   2022     3
##  6             1 2022-03-01 00:00:00 Education  -38.0 Mar 2022   2022     3
##  7             2 2022-03-01 00:00:00 Wage      2437.  Mar 2022   2022     3
##  8             2 2022-03-01 00:00:00 Shelter   -557.  Mar 2022   2022     3
##  9             2 2022-03-01 00:00:00 Education  -12.8 Mar 2022   2022     3
## 10             3 2022-03-01 00:00:00 Wage      2367.  Mar 2022   2022     3
## # ... with 1,856,320 more rows
# Sum of all journal amounts per participant. Expenses are stored as
# negative amounts, so `earn` is income minus spending.
spend_per <- spend %>%
  group_by(participantId) %>%
  summarise(earn = sum(amount))

# Histogram of the per-participant totals.
ggplot(spend_per) +
  geom_histogram_interactive(aes(x = earn))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Print the per-participant totals for inspection.
spend_per
## # A tibble: 1,011 x 2
##    participantId    earn
##            <dbl>   <dbl>
##  1             0 116678.
##  2             1  97282.
##  3             2  86065.
##  4             3  81617.
##  5             4 101944.
##  6             5   7101.
##  7             6   5098.
##  8             7  58455.
##  9             8  21402.
## 10             9 167309.
## # ... with 1,001 more rows
# Tooltip text for each participant: the id and the net total on separate
# lines.
spend_per$tooltip <- paste0(
  "ID = ", spend_per$participantId,
  "\n earn = ", spend_per$earn
)

# Interactive dot plot of the per-participant totals.
# The tooltip aesthetic must name the `tooltip` column created above. The
# previous `spend_per$toolip` was a misspelling that evaluated to NULL, so
# no tooltip was mapped and nothing appeared on hover.
p <- ggplot(data = spend_per, aes(x = earn)) +
  geom_dotplot_interactive(
    aes(tooltip = tooltip),
    stackgroups = TRUE,
    binwidth = 1500,
    method = "histodot"
  ) +
  # The stacked-dot count axis carries no useful scale, so hide it.
  scale_y_continuous(NULL, breaks = NULL)

# Render the plot as an interactive SVG widget.
girafe(
  ggobj = p,
  width_svg = 10,
  height_svg = 10 * 0.618
)
## Warning: Unknown or uninitialised column: `toolip`.
## Unknown or uninitialised column: `toolip`.
## Warning: Use of `spend_per$toolip` is discouraged. Use `toolip` instead.
0 50000 100000 150000 200000 250000 earn

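 I encountered an issue: the girafe plot does not show a tooltip when hovering over a point. The warnings above point to the cause: the tooltip aesthetic refers to the misspelled column `toolip` instead of `tooltip`.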

 Plot how many jobs each employer offers during the period

# Read the jobs table: one row per job, with its employer and hourly rate.
job <- read_csv("data/jobs.csv")
## Rows: 1328 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): daysToWork, educationRequirement
## dbl  (3): jobId, employerId, hourlyRate
## time (2): startTime, endTime
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the jobs table.
head(job)
## # A tibble: 6 x 7
##   jobId employerId hourlyRate startTime endTime daysToWork      educationRequir~
##   <dbl>      <dbl>      <dbl> <time>    <time>  <chr>           <chr>           
## 1     0        379       10   07:46     15:46   [Monday,Tuesda~ HighSchoolOrCol~
## 2     1        379       22.2 07:31     15:31   [Monday,Tuesda~ Bachelors       
## 3     2        380       10   08:00     16:00   [Monday,Tuesda~ Bachelors       
## 4     3        380       15.3 07:39     15:39   [Monday,Tuesda~ Bachelors       
## 5     4        381       21.4 07:53     15:53   [Monday,Tuesda~ HighSchoolOrCol~
## 6     5        381       12.1 08:13     16:13   [Monday,Sunday~ HighSchoolOrCol~
# Largest job id. The preview above shows ids starting at 0, so this is one
# less than the number of jobs.
max(job$jobId)
## [1] 1327
# jobId runs from 0 to 1327, so there are 1328 jobs in total.
# Count the jobs per employer, then draw how many employers have each job
# count. The count becomes a factor so the bars stay in numeric order;
# a character column would sort "10" before "2".
p1 <- job %>%
  count(employerId, name = "njob") %>%
  mutate(njob = factor(njob)) %>%
  ggplot(aes(x = njob)) +
  geom_bar() +
  ggtitle("Number of jobs posted by employers from 2021 to 2022") +
  labs(y = "count of employers", x = "number of jobs")
ggplotly(p1)
234567890102030
Number of jobs posted by employers from 2021 to 2022number of jobscount of employers