Planning and Forecasting for El Paso’s Bus Transit Alternatives

Charlie Huemmler, Yingxue Ou, Jack Rummler

2023-05-04

Introduction

df <- riderstops %>%
  na.omit(riderstops[, c("stop_lat", "stop_lon")]) %>% 
  st_as_sf(coords = c("stop_lon", "stop_lat"), crs = 4269)

df$longitude <- st_coordinates(df$geometry)[, "X"]
df$latitude <- st_coordinates(df$geometry)[, "Y"]

df$RT <- as.numeric(df$RT)

df <- df %>% # CLIENT DATA
  mutate(
    type = case_when(
      RT == 2 ~ "Local",
      RT == 7 ~ "Local",
      RT == 10 ~ "Local",
      RT == 14 ~ "Local",
      RT == 15 ~ "Local",
      RT == 24 ~ "Local",
      RT == 25 ~ "Local",
      RT == 32 ~ "Local",
      RT == 33 ~ "Local",
      RT == 34 ~ "Local",
      RT == 35 ~ "Local",
      RT == 36 ~ "Local",
      RT == 37 ~ "Local",
      RT == 50 ~ "Local",
      RT == 51 ~ "Local",
      RT == 52 ~ "Local",
      RT == 53 ~ "Local",
      RT == 54 ~ "Local",
      RT == 58 ~ "Local",
      RT == 61 ~ "Local",
      RT == 62 ~ "Local",
      RT == 63 ~ "Local",
      RT == 64 ~ "Local",
      RT == 65 ~ "Local",
      RT == 66 ~ "Local",
      RT == 67 ~ "Local",
      RT == 68 ~ "Local",
      RT == 69 ~ "Local",
      RT == 72 ~ "Local",
      RT == 74 ~ "Local",
      RT == 86 ~ "Local",
      RT == 4 ~ "Circulator",
      RT == 8 ~ "Circulator",
      RT == 5 ~ "Express",
      RT == 6 ~ "Express",
      RT == 26 ~ "Express",
      RT == 59 ~ "Express",
      RT == 76 ~ "Express",
      RT == 11 ~ "Feeder",
      RT == 12 ~ "Feeder",
      RT == 13 ~ "Feeder",
      RT == 16 ~ "Feeder",
      RT == 19 ~ "Feeder",
      RT == 43 ~ "Feeder",
      RT == 44 ~ "Feeder",
      RT == 46 ~ "Feeder",
      RT == 56 ~ "Feeder",
      RT == 60 ~ "Feeder",
      RT == 84 ~ "Feeder",
      RT == 89 ~ "Feeder",
      RT == 205 ~ "BRIO",
      RT == 206 ~ "BRIO",
      RT == 207 ~ "BRIO",
      RT == 208 ~ "BRIO",
      RT == 17 ~ "Other",
      RT == 20 ~ "Other",
      RT == 21 ~ "Circulator",
      RT == 41 ~ "Other",
      RT == 82 ~ "Other",
      RT == 87 ~ "Other",
      RT == 500 ~ "Streetcar"
    )
  )

df <- df %>% # Removes tainted data
  dplyr::filter(type != "Other")

classified <- bus_routes %>% # OPEN DATA EL PASO DATA
    mutate(
    type = case_when(
      ROUTE == 2 ~ "Local",
      ROUTE == 7 ~ "Local",
      ROUTE == 10 ~ "Local",
      ROUTE == 14 ~ "Local",
      ROUTE == 15 ~ "Local",
      ROUTE == 24 ~ "Local",
      ROUTE == 25 ~ "Local",
      ROUTE == 32 ~ "Local",
      ROUTE == 33 ~ "Local",
      ROUTE == 34 ~ "Local",
      ROUTE == 35 ~ "Local",
      ROUTE == 36 ~ "Local",
      ROUTE == 37 ~ "Local",
      ROUTE == 50 ~ "Local",
      ROUTE == 51 ~ "Local",
      ROUTE == 52 ~ "Local",
      ROUTE == 53 ~ "Local",
      ROUTE == 54 ~ "Local",
      ROUTE == 58 ~ "Local",
      ROUTE == 61 ~ "Local",
      ROUTE == 62 ~ "Local",
      ROUTE == 63 ~ "Local",
      ROUTE == 64 ~ "Local",
      ROUTE == 65 ~ "Local",
      ROUTE == 66 ~ "Local",
      ROUTE == 67 ~ "Local",
      ROUTE == 68 ~ "Local",
      ROUTE == 69 ~ "Local",
      ROUTE == 72 ~ "Local",
      ROUTE == 74 ~ "Local",
      ROUTE == 86 ~ "Local",
      ROUTE == 4 ~ "Circulator",
      ROUTE == 8 ~ "Circulator",
      ROUTE == 5 ~ "Express",
      ROUTE == 6 ~ "Express",
      ROUTE == 26 ~ "Express",
      ROUTE == 59 ~ "Express",
      ROUTE == 76 ~ "Express",
      ROUTE == 11 ~ "Feeder",
      ROUTE == 12 ~ "Feeder",
      ROUTE == 13 ~ "Feeder",
      ROUTE == 16 ~ "Feeder",
      ROUTE == 19 ~ "Feeder",
      ROUTE == 43 ~ "Feeder",
      ROUTE == 44 ~ "Feeder",
      ROUTE == 46 ~ "Feeder",
      ROUTE == 56 ~ "Feeder",
      ROUTE == 60 ~ "Feeder",
      ROUTE == 84 ~ "Feeder",
      ROUTE == 89 ~ "Feeder",
      ROUTE == 205 ~ "BRIO",
      ROUTE == 206 ~ "BRIO",
      ROUTE == 207 ~ "BRIO",
      ROUTE == 208 ~ "BRIO",
      ROUTE == 17 ~ "Other",
      ROUTE == 20 ~ "Other",
      ROUTE == 21 ~ "Circulator",
      ROUTE == 41 ~ "Other",
      ROUTE == 82 ~ "Other",
      ROUTE == 87 ~ "Other",
      ROUTE == 500 ~ "Streetcar"
    )
  )

classified <- na.omit(classified)

local <- classified[classified$type %in% "Local", ]

time <- df %>%
  dplyr::filter(type %in% "Local") %>%
  group_by(Date) %>%
  summarize(ridership = sum(Ons) + sum(Offs))


# Aggregating at route level for average and total ridership 
RT_avg_sum <- df %>%
  group_by(RT) %>%
  summarize(avg_ons = mean(Ons), 
            avg_offs = mean(Offs),
            total_ons = sum(Ons),
            total_offs = sum(Offs)) %>%
  mutate(total_ridership = total_ons + total_offs) %>%
  mutate(
    type = case_when(
            RT == 2 ~ "Local",
      RT == 7 ~ "Local",
      RT == 10 ~ "Local",
      RT == 14 ~ "Local",
      RT == 15 ~ "Local",
      RT == 24 ~ "Local",
      RT == 25 ~ "Local",
      RT == 32 ~ "Local",
      RT == 33 ~ "Local",
      RT == 34 ~ "Local",
      RT == 35 ~ "Local",
      RT == 36 ~ "Local",
      RT == 37 ~ "Local",
      RT == 50 ~ "Local",
      RT == 51 ~ "Local",
      RT == 52 ~ "Local",
      RT == 53 ~ "Local",
      RT == 54 ~ "Local",
      RT == 58 ~ "Local",
      RT == 61 ~ "Local",
      RT == 62 ~ "Local",
      RT == 63 ~ "Local",
      RT == 64 ~ "Local",
      RT == 65 ~ "Local",
      RT == 66 ~ "Local",
      RT == 67 ~ "Local",
      RT == 68 ~ "Local",
      RT == 69 ~ "Local",
      RT == 72 ~ "Local",
      RT == 74 ~ "Local",
      RT == 86 ~ "Local",
      RT == 4 ~ "Circulator",
      RT == 8 ~ "Circulator",
      RT == 5 ~ "Express",
      RT == 6 ~ "Express",
      RT == 26 ~ "Express",
      RT == 59 ~ "Express",
      RT == 76 ~ "Express",
      RT == 11 ~ "Feeder",
      RT == 12 ~ "Feeder",
      RT == 13 ~ "Feeder",
      RT == 16 ~ "Feeder",
      RT == 19 ~ "Feeder",
      RT == 43 ~ "Feeder",
      RT == 44 ~ "Feeder",
      RT == 46 ~ "Feeder",
      RT == 56 ~ "Feeder",
      RT == 60 ~ "Feeder",
      RT == 84 ~ "Feeder",
      RT == 89 ~ "Feeder",
      RT == 205 ~ "BRIO",
      RT == 206 ~ "BRIO",
      RT == 207 ~ "BRIO",
      RT == 208 ~ "BRIO",
      RT == 17 ~ "Other",
      RT == 20 ~ "Other",
      RT == 21 ~ "Circulator",
      RT == 41 ~ "Other",
      RT == 82 ~ "Other",
      RT == 87 ~ "Other",
      RT == 500 ~ "Streetcar"
    )
  )

RT_avg_sum_st <- st_drop_geometry(RT_avg_sum) # no geom

# Ridership at strictly route level
RT_stop_avg <- df %>%
  group_by(RT, TP, longitude, latitude) %>%
  summarize(avg_ons = mean(Ons), 
            avg_offs = mean(Offs),
            total_ons = sum(Ons),
            total_offs = sum(Offs)) %>%
  mutate(total_ridership = (total_ons + total_offs))

RT_stop_avg_st <- st_drop_geometry(RT_stop_avg) # no geom

Acknowledgement

This project was created by Charlie Huemmler, Yingxue Ou, and Jack Rummler as part of the University of Pennsylvania’s Master of Urban Spatial Analytics and Smart Cities practicum (MUSA 801). We would like to thank our client, Alex Hoffman, AICP, of the El Paso Capital Improvements Department for his continual knowledge, eagerness to help, and enthusiasm for this project. We also would like to thank Professors Matthew Harris, Michael Fichman, and Mjumbe Poe for their invaluable assistance and mentorship throughout this process.

To check out other practicum projects for the MUSA/Smart Cities practicum, click here.
To check out our web application, click here.
To check our Github Pages, click here.

Introduction and Use Case

Image Credit: KTSM 9 News

“El Paso, often forgotten, is building a stronger transit network than most US cities.”
Christof Spieler (Trains, Buses, People)

Transit ridership is often a key metric to assess the efficiency of public transportation systems. Like many transit agencies in the last decade, there has been an annual decrease in transit ridership, with the sharpest decline in ridership at the beginning of the COVID-19 pandemic. While many transit agencies are finally beginning to see comparable transit ridership now as compared to pre-2020, transit agencies are still grappling with how to maintain existing riders and lure new riders through trustworthy and interconnected service.

Sun Metro Mass Transit Department, referred to as Sun Metro, is the transportation provider in El Paso, Texas and is a department of the city of El Paso. Sun Metro has experienced a similar nationwide trend of declining ridership for several years prior to the pandemic and a sharp decrease in ridership and service during and post-pandemic. Moreover, Sun Metro has made several expansions to its transit service in the past decade, which include but are not limited to:

Bus Rapid Transit (BRIO): Introduced four BRIO lines between 2014 and 2022, which experience the agency’s highest ridership and most frequent service.
Streetcar Network: Re-ignited the streetcar network in 2018 using restored PCC streetcars, providing more circuitous service in downtown, uptown, and University of Texas at El Paso.
El Paso Upper East Side Transit Center: New transit center connected to the Five Points Transit Center, the Montana BRIO line, and more park-and-ride services.

In 2022, Sun Metro experienced about 65% of pre-pandemic ridership numbers, but the agency is looking to explore the implications of new bus transit alternative scenarios given its service expansions and future growth trajectory. Particularly, it is looking at ways to maximize equity and accessibility in addition to fare revenue optimization.

The balance between equity and revenue can often be a challenge for transit agencies to balance. While agencies want to provide access to transit for all of its residents, and particularly those who may be transit-dependent for all of their trips, this means placing bus stops in under-resourced neighborhoods that may not be profitable for the agency. Our client revealed that the El Paso Capital Improvements Department is looking to invest a certain percentage of annual funds toward under-resourced areas.

Our team defined equity optimization as balancing the service distribution among all populations regardless of race, socioeconomic status, ability, and other demographic variables. We also defined fare revenue optimization as maximizing total ridership to ensure routes have profitability for Sun Metro.

Methodology

For this project, we are building a latent bus transit demand model. We are supplied ridership data for the year 2022, which contains number of on-boardings and off-boardings per route, stop, and day. We built a model to predict bus transit ridership training our model on an 80% subset of our data and testing it on 20% of our data to receive predicted ridership.

Our data is aggregated to hexbins, which are equally shaped hexagons that act as an overlay on top of the city of El Paso boundary. Total ridership is aggregated into each hexbin to predict what the total ridership is based on the features engineered into our model. We built two models: a Random Forest model and an XGBoost model, but ultimately used the XGBoost model.

We are building this model to inform our client’s interest in knowing how bus transit alternative routes will affect route-level equity and fare revenue implications. In our model, we bake in equity metrics, which highlight the ability for all residents, and especially transit-dependent populations, to have operable, efficient service, in addition to the accessibility of stops for all El Paso residents. Fare revenue optimization is derived from ridership. A Sun Metro planner could look at both current routes to see if our predicted ridership is higher or lower than current ridership to understand if there is increased demand for bus transit, or submit new routes to understand route level predicted ridership and how it compares to existing routes predicted ridership. If our model can accurately predict bus transit ridership, it can inform how new transit routes perform on these indicators, which the web application we develop will evaluate.

Thus, this project is two-fold: a predictive bus transit ridership model and a performance-based measurement evaluating the equity and financial implications of transit demand.

Latent bus transit demand model: We are predicting bus transit ridership based on 2022 ridership to model demand for transit services in the future. We are building correlative features into our model that are predictive of ridership while also maintaining the equity indicators essential to city-wide accessibility and efficiency of bus transit.
Informational network web app: A web app where a Sun Metro planner can input their own .geojson file of a new transit line and assess it on fare revenue (ridership) and equity (different independent variables assessing demographic distributions) optimization.

Exploratory Data Analysis

In this section, we look at both the context area of El Paso and how it relates to transit ridership, in addition to the data given to us by our client.

El Paso Context: El Paso has a unique positionality and geography that makes roadway infrastructure complex, inherently impacting the structure of bus routes.
Ridership Data: The El Paso Capital Improvements Department supplied us with ridership data for the year 2022. This data was collected at every route and stop in the transit network by collecting onboarding and offboarding data using bus sensors. Note that only about 85% of buses had these sensors at any given time, thus there is some inherent bias in the data; however, it does still paint a high level picture of ridership patterns in the city.

El Paso Context

El Paso is the sixth-largest city in Texas located at the far western tip of the state, just immediately north of Ciudad Juárez in Mexico. The city is bounded by the Rio Grande River to the south, the Franklin Mountain Range to the north, and Fort Bliss Military Base to the northeast. The context map below shows the geographical constraints where the Franklin Mountains are represented in purple, Fort Bliss Military Base in green, and the Rio Grande River in light blue, overlain on top of population density of the city at a census tract level. As we can see, downtown El Paso is notched in between the mountains and river, with heavy sprawl toward the north and east.

el_paso <- el_paso %>% st_transform(crs = 4269)
ep_bus_routes <- st_intersection(el_paso, bus_routes)

grid.arrange(ncol=2,
ggplot() +
  geom_sf(data=ep_co, fill=palette6[2], color=palette6[2]) +
  geom_sf(data=el_paso, fill="grey", color='grey') +
  geom_sf(data=rio, color="blue", fill='blue', size = 2) +
  geom_sf(data=franklin, fill=palette6[6], color=palette6[6]) +
  geom_sf(data=ft_bliss, fill=palette6[4], color=palette6[4]) +
  geom_sf(data=bus_routes, color="black") +
  labs(caption="El Paso (county) in pink\nEl Paso (city) in grey\nRio Grande River in blue\nFranklin Mountains State Park in purple\nFort Bliss Military Base in olive\nCurrent bus network in black")+
  mapTheme3,
ggplot()+
  geom_sf(data=tx, fill='grey', color='black', size=0.5)+
  geom_sf(data=ep_co, fill=palette6[2])+
  theme(panel.background = element_rect(fill = "white"))+
  mapTheme2, top="El Paso - Context Map"
)

Current Bus Network

We utilized road centerline data and bus route data obtained from El Paso’s Open Data portal to understand the overall transit system’s network. The current network is visually represented below, where roads were depicted in black and the bus network was overlain in orange. We can see that the network coverage extended significantly towards areas surrounding downtown El Paso. However, several neighborhoods located in the eastern part of the city remain unserved by the current transit system. This highlights a potential gap in the transit service coverage of the city and underscores the need for equity considerations in ensuring any El Paso resident has access to transit.

road_centerlines <- road_centerlines %>%
  st_transform(st_crs(el_paso))

ep_roads <- st_intersection(road_centerlines, el_paso)

# ggplot()+
#   geom_sf(data=el_paso,
#           fill="lightgrey",
#           color=NA)+
#   geom_sf(data=ep_roads,
#           alpha=0.5, 
#           color="black", 
#           size=1.5)+
#   geom_sf(data=bus_routes,
#           alpha=0.9, 
#           color="#C96A52FF", 
#           size=1.5)+
#   ggsn::scalebar(el_paso, location="bottomleft", dist_unit = 'mi', dist=5, height=0.01, transform=TRUE, model='WGS84')+
#   labs(title="Current Bus Network",
#        subtitle="Sun Metro Mass Transit - El Paso, TX",
#        caption="Current road network in black\nCurrent bus transit network in orange")+
#   mapTheme2

Current Network Map

Analyzing Route Ridership

route_type_agg <- merge(RT_avg_sum_st, bus_routes, by.x = "RT", by.y = "ROUTE") %>% st_sf() %>%
  dplyr::select(c("RT", "type", "geometry")) %>% st_transform(crs=4269)

localRT <- subset(route_type_agg, type %in% 'Local')
circulatorRT <- subset(route_type_agg, type %in% 'Circulator')
expressRT <- subset(route_type_agg, type %in% 'Express')
feederRT <- subset(route_type_agg, type %in% "Feeder")

grid.arrange(ncol=3,
ggplot()+
  geom_sf(data=el_paso, fill="black")+
  geom_sf(data=brio2, color=palette6[1], fill=palette6[1], size=3)+
  labs(title="BRIO") + mapTheme,
ggplot()+geom_sf(data=el_paso, fill="black")+
  geom_sf(data=circulatorRT, color=palette6[2], fill=palette6[2], size=3)+
  labs(title="Circulator") + mapTheme,
ggplot()+geom_sf(data=el_paso, fill="black")+
  geom_sf(data=expressRT, color=palette6[3], fill=palette6[3], size=3)+
  labs(title="Express") + mapTheme,
ggplot()+geom_sf(data=el_paso, fill="black")+
  geom_sf(data=feederRT, color=palette6[4], fill=palette6[4], size=3)+
  labs(title="Feeder") + mapTheme,
ggplot()+geom_sf(data=el_paso, fill="black")+
  geom_sf(data=localRT, color=palette6[5], fill=palette6[5], size=3)+
  labs(title="Local") + mapTheme
)

Next, we looked at current route ridership. We aggregated the dataset to the route level and calculated the total ridership of each route.

#merged <- merge(transit_lines, RT_avg_sum_st, by.x = "route_short_name", by.y = "RT")

RT_avg_sum_st <- RT_avg_sum_st[order(-RT_avg_sum_st$total_ridership),]

local_RT_avg <- RT_avg_sum_st[RT_avg_sum_st$type %in% "Local", ]

ggplot(RT_avg_sum_st, aes(y = total_ridership, x = reorder(factor(RT), -total_ridership), fill = type)) +
  geom_bar(stat = "identity") +
  xlab("Total Ridership") +
  ylab("Route") +
  scale_fill_manual(values = palette7, name="Type of Bus", expand=c(0.2, 0)) +
  labs(title="Total Ridership by Route and Type",
       caption="Data: El Paso Capital Improvements Department, 2022")+
  plotTheme+
  theme(axis.text.x = element_text(angle=90))

percentage <- RT_avg_sum_st %>%
  group_by(type) %>%
  summarize(
    numOfRoutes = n(),
    pctOfSystemRidersip = round(sum(total_ridership) / sum(RT_avg_sum_st$total_ridership) * 100, 2))

pctKable <- percentage %>%
  kable(format = "html", escape = FALSE) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  row_spec(1, bold = T, color = "white", background = palette6[1]) %>%
  row_spec(2, bold = T, color = "white", background = palette6[2]) %>%
  row_spec(3, bold = T, color = "white", background = palette6[3])%>%
  row_spec(4, bold = T, color = "white", background = palette6[4])%>%
  row_spec(5, bold = T, color = "white", background = palette6[5])%>%
  row_spec(6, bold = T, color = "white", background = palette6[6]) 
pctKable

type	numOfRoutes	pctOfSystemRidersip
BRIO	4	39.54
Circulator	3	1.14
Express	5	10.37
Feeder	12	6.86
Local	31	41.31
Streetcar	1	0.79

BRIO

As stated before, the BRIO bus rapid transit routes dominate in ridership, accounting for approximately 40% of total transit ridership. The three most popular routes are the 205, 206, and 207. The fourth BRIO line, opened in October 2022, still had higher ridership than almost half of other routes. Because they have the most frequent service and connect to 28 out of the 31 local routes, we can logically derive why their ridership patterns are so high.

Route 59

Route 59 is one of the five express routes that experiences high ridership. According to the state of the system report in July 2022, Route 59 accounted for high ridership, notably for its service frequency and its connectivity between downtown and the Eastside Transit Center.

Local

Local routes make up the majority of transit roadway infrastructure, and they are the only routes that are unfixed. They also have the most political malleability - transit planning is a very political act, and can often have a lot of bias in the route methodology, but through our data-oriented approach, we can successfully build an unbiased informational network so transit planners can leverage our findings to build a more interconnected, equitable route network.

While there are a handful of local ridership routes that have comparable ridership to higher demand routes, like BRIO and Route 59, the trend teeters out, and most local routes experience relatively low ridership. This can be attributed to a variety of factors: there are low demand in these areas, infrequent service, inaccessibility, or other.

For the purposes of modeling bus transit scenarios, we leverage only local route ridership into our modeling process; however, it is important to analyze the system at large to understand the connectivity of Sun Metro and how local routes can better facilitate increased ridership and greater accessibility throughout the city.

Ridership Scatterplot

As the scatter plot indicates, there is a strong correlation between average on-board and average off-board frequency. With an r-squared of 0.92, this indicates that in bus transit trips, the average rider may exhibit round-trip behavior, boarding the bus at point A to travel to point B and then returning to point A without any intermediate stops.

ggplot(RT_stop_avg_st, aes(x = avg_ons, y = avg_offs)) +
  geom_point() +
  labs(x = "Average on-boarding", 
       y = "Average off-boarding", 
       title = "Scatterplot of on-boarding and off-boarding", 
       subtitle="Average values grouped by route and stop information",
       caption="Data: El Paso Capital Improvements Department, 2022")+
  geom_smooth(method = lm, se=FALSE, colour = "#C96A52FF", size=1, )+
  geom_abline(intercept=0, slope=1, color="#C96A52FF", alpha=0.3, size=1, style='dashed')+
  annotate("text", x = Inf, y = Inf, hjust = 5, vjust = 5,
           label = paste0("R^2 = ", round(summary(lm(avg_ons ~ avg_offs, data = RT_stop_avg_st))$r.squared, 3)))+
  plotTheme

Ridership by Census Tract

Ridership of each stop is aggregated into the census tracts of El Paso (2019) to understand spatial clustering of ridership. Total ridership of each tract is separated into quintiles. As we see below, there is some pretty significant clustering of high ridership in the downtown region, with high ridership falling along significant BRIO corridors. The further we sprawl out into the east and north, the lower ridership is.

local1 <- df[df$type %in% "Local", ]

local_agg <- local1 %>%
  group_by(RT, TP, longitude, latitude) %>%
  summarize(total_ons = sum(Ons),
            total_offs = sum(Offs)) %>%
  mutate(total_ridership = (total_ons + total_offs)) %>%
  dplyr::select(-c("total_ons", "total_offs"))

local_agg <- local_agg %>%
  dplyr::select(c("geometry", "total_ridership")) %>%
  st_transform(crs=4269)

ep_tracts <- ep_tracts %>% # EP TRACTS ARE NOW CRS 4269
  st_transform(crs=4269)

local_agg_tracts <- st_join(local_agg, ep_tracts, join = st_within)

local_agg_tracts <- local_agg_tracts %>%
  group_by(GEOID) %>%
  summarise(total_ridership = sum(total_ridership))

local_agg_tracts <- local_agg_tracts %>%
  st_drop_geometry()

agg_data2 <- left_join(local_agg_tracts, ep_tracts, by = "GEOID") %>%
  st_as_sf() %>%
  st_transform(crs=4269) %>%
  dplyr::select(c("total_ridership", "geometry", "GEOID"))

ggplot()+ 
  geom_sf(data = el_paso, fill="lightgrey", color="black")+
  geom_sf(data = agg_data2, aes(fill = q5(total_ridership)), color=NA)+
  geom_sf(data = brio, fill="black", color="black", size=1)+
  scale_fill_manual(values = palette5cont, 
                    labels = c("Bottom 20%", "20% to 40%", "40% to 60%", "60% to 80%", "Top 20%"),
                    name="Ridership\nQuintiles") +
  labs(title="Total Local Ridership by Census Tract",
       subtitle="El Paso, TX (2022)",
       caption="BRIO Routes in black")+
  mapTheme3

Along the southwest border, there are several census tracts, that despite having a BRIO line running directly through them, still experience some of the lowest local ridership in the city. This could be a potential area of interest to invest new local ridership in.

Leaflet Map - Total Ridership

Finally, we created a Leaflet map to allow interactivity with El Paso, and broke down total local ridership in 2022 to a quintile level. You can check out the map with this link.

Leaflet Map - Ridership for year 2022 at stop and route level, divided by quintiles

Some insights we derived from this map:

Downtown clustering: High ridership can be found in the downtown region of El Paso, given the population and job density, high density of stops and intersections, the convergence of multiple BRIO routes, and the downtown transfer bay.
High ridership along BRIO corridors: since several local routes connect to essential BRIO corridors, you can observe high ridership along many of these roads, such as Mesa and Montana. Leveraging connectivity to existing BRIO routes can encourage multi-modality and provide greater access to more frequent service provision.
Sprawl and ridership: The further we sprawl to the north and east, we see less overall ridership, and less frequency of stops to fall into higher ridership quintiles.
Southeast/I-10 corridor: Along the southeastern part of the city along the Mexico border, we see a clustering of stops with very low ridership, despite having several local routes connecting this region and a BRIO line. This may be for a variety of reasons such as low frequency in existing routes or high automobile dependency in this area.

Feature Engineering

Our feature engineering process is broken into two parts: an explanation of our spatial unit and the features we chose to aggregate.

Spatial Unit: Hexbins

Hexbins are a way to create a uniform spatial structure across our study region for the sake of creating a granular analysis of our region. We aggregate total ridership into each hexbin, split our data into training and test sets, and get predicted ridership for each hexagon.

We delineated our hexbins by cropping to only existing transit line infrastructure and removing land uses that would be non-conducive to transit ridership. We also removed hexes that posed great outliers: particularly a downtown hex and hexes with transfer bays. We deemed this necessary to improve both the accuracy of the model while knowing transit planners already know to link bus routes with essential connections such as these regions. Our final hexbin map is presented below.

ggplot()+
  geom_sf(data=el_paso, fill='grey')+
  geom_sf(data=hex_final, fill=c(palette1), alpha=0.7)+
  labs(title="Hex Bin Grid", subtitle = "El Paso, Texas")+
  mapTheme

Most of our hexes have zero or close to zero observations in terms of ridership, so we logarithmically transform the ridership below, showing it has a relatively normal distribution.

ggplot(hex_final, aes(x = log(ridership))) +
  geom_histogram(color = "black", fill = palette1) +
  labs(title = "Logarithmic Distribution of Ridership per Hex",
       subtitle = "El Paso, TX",
       x = "log(ridership)",
       y = "Count")+
  plotTheme

American Community Survey (2019)

We utilized the census API to gather a range of demographic information, socioeconomic status, household income and occupancy rates, education, and travel pattern characteristics, among other variables. This data will provide valuable insights into the equity and accessibility implications of our model, as marginalized populations often heavily rely on public transit but may face barriers in utilizing buses efficiently. We aimed to explore the spatial distribution of these demographics to identify any potential spatial relationships and understand their potential impact on bus transit demand.

censusvarsEP <- c(
  "B01001_001E", # ACS total Pop estimate
  "B01001I_001E", # Population - Hispanic or Latino 
  "B02001_002E", # Population - White alone
  "B02001_003E", # Population - Black or African American alone
  "B02001_004E", # Population - American Indian and Alaska Native alone
  "B02001_005E", # Population - Asian alone
  "B02001_006E", # Population - Native Hawaiian and Other Pacific Islander alone
  "B02001_007E", # Population - Some other race alone
  "B02001_008E", # Population - Two or more races
  "B19013_001E", # Median household income in the past 12 months (in 2019 inflation-adjusted dollars)
  "B25001_001E", # Total housing units
  "B25002_002E", # Occupancy status - Occupied housing units
  "B25024_001E", # Gross rent as a percentage of household income in the past 12 months
  "B25044_001E", # Vehicles available
  "B28005_001E", # Means of transportation to work by age
  "B28010_001E", # Commuting time to work (in minutes)
  "B10052_002E", # Disability
  "B06009_002E", # Less than high school
  "B06009_003E", # High School
  "B06009_004E", # Associates or equivalent
  "B06009_005E", # Bachelors
  "B06009_006E" # Graduate or prof. degree
)

acs_ep <- get_acs(geography = "tract",
                  year = 2019, 
                  variables = censusvarsEP, 
                  geometry = T,
                  state = "TX", 
                  county = "El Paso", 
                  output = "wide") 


acs_ep <- acs_ep %>%
  rename(
    totalPop = B01001_001E, 
    hlPop = B01001I_001E, 
    whitePop = B02001_002E, 
    blackPop = B02001_003E,
    aiPop = B02001_004E,
    asianPop = B02001_005E,
    nhPop = B02001_006E,
    otherRacePop = B02001_007E,
    twoPlusRacePop = B02001_008E,
    medHHInc = B19013_001E,
    totalHU = B25001_001E, 
    occupiedHU = B25002_002E, 
    grossRentPerInc = B25024_001E, 
    transByAge = B28005_001E,
    commuteToWork = B28010_001E,
    lessThanHS = B06009_002E,
    highSchool = B06009_003E,
    associatesDeg = B06009_004E,
    bachelorDeg = B06009_005E,
    professionalDeg = B06009_006E,
    disability = B10052_002E) %>%
  dplyr::select(-c("B01001_001M", "B01001I_001M", "B02001_002M", "B02001_003M", "B02001_004M", "B02001_005M", "B02001_006M", "B02001_007M", "B02001_008M", "B19013_001M", "B25001_001M", "B25002_002M", "B25024_001M", "B25044_001M", "B28005_001M", "B28010_001M","B10052_002M", "B06009_002M", "B06009_003M", "B06009_004M", "B06009_005M", "B06009_006M")) 


# %>%
#   mutate(area_sqmile = st_area(geometry)/2590000) %>%  
#   st_as_sf(crs = 4269) %>%
#   mutate_at(vars(-dontdiv), funs(as.numeric(./area_sqmile)))
# 
acs_ep <- acs_ep %>%
  mutate(
    whitePopPct = (whitePop/totalPop)*100,
    hlPopPct = (hlPop/totalPop)*100,
    asianPopPct = (asianPop/totalPop)*100,
    blackPopPct = (blackPop/totalPop)*100,
   otherRacePopPct = (otherRacePop/totalPop)*100,
    nhPopPct = (nhPop/totalPop)*100,
    aiPopPct = (aiPop/totalPop)*100,
    occupiedHUPct = (occupiedHU/totalHU)*100,
    vacantHUPct = ((totalHU-occupiedHU)/totalHU)*100
  ) 


acs_ep <- na.omit(acs_ep)

st_crs(acs_ep) <- st_crs(el_paso)
acs_ep <- st_intersection(acs_ep, el_paso) %>% 
  dplyr::select(-c("NAME"
                 ))


acs_ep_plots <- acs_ep %>% # For EDA/features plots
  mutate(
    whitePopPct = (whitePop/totalPop)*100,
    hlPopPct = (hlPop/totalPop)*100,
    asianPopPct = (asianPop/totalPop)*100,
    blackPopPct = (blackPop/totalPop)*100,
    occupiedHUPct = (occupiedHU/totalHU)*100,
    vacantHUPct = ((totalHU-occupiedHU)/totalHU)*100
  )


acs_ep_plots <- st_intersection(el_paso, acs_ep_plots)

agg_data_st <- agg_data2 %>%
  st_drop_geometry()


acs_extf_cols <- c("whitePopPct", "hlPopPct","asianPopPct",   "blackPopPct", "otherRacePopPct", "nhPopPct", "aiPopPct")

acs_ep_hex <- acs_ep %>% dplyr::select(-c("OBJECTID", "NAME.1", "GlobalID", "Shape_Length", "Shape_Area","B25044_001E"
                 ),-GEOID, -medHHInc) %>% st_interpolate_aw(hex, extensive = T) %>%
  mutate(uniqueID = as.numeric(rownames(.)))

Racial Distribution in El Paso

Six maps of the total, White, Hispanic/Latino, Black, Asian, and other race populations are below. It appears that there is racial segregation, as census tracts with the greatest density of Black residents seem to have decreased amount of White and Latino residents. There also is a strip along the Rio Grande in the southeastern part of the city where there is an incredibly high density of Hispanic/Latino residents. A high density of Asian residents seem to live just northwest of downtown. With an interest in ensuring all demographic groups have access to transit, understanding racial distribution is essential to design routes that equitably serve all races.

grid.arrange(ncol=3,
             ggplot()+
               geom_sf(data=hex_final, aes(fill=totalPop), color=NA)+
               scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="per\nsq km")+
               labs(title="Total population")+
               mapTheme3,
             ggplot()+
               geom_sf(data=hex_final, aes(fill=whitePop), color=NA)+
               scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="per\nsq km")+
               labs(title="White population")+
               mapTheme3,
             ggplot()+
               geom_sf(data=hex_final, aes(fill=blackPop), color=NA)+
               scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="per \nsq km")+
               labs(title="Black population")+
               mapTheme3,
             ggplot()+
               geom_sf(data=hex_final, aes(fill=asianPop), color=NA)+
               scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="per \nsq km")+
               labs(title="Asian population")+
               mapTheme3,
             ggplot()+
               geom_sf(data=hex_final, aes(fill=hlPop), color=NA)+
               scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="per \nsq km")+
               labs(title="Hispanic/Latino population")+
               mapTheme3,
             ggplot()+
               geom_sf(data=hex_final, aes(fill=nhPop+otherRacePop+aiPop+twoPlusRacePop), color=NA)+
               scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="per \nsq km")+
               labs(title="Other Race population")+
               mapTheme3,
             top="Racial Distribution in El Paso, TX", bottom="Data: ACS, 2019"
)

Household Vacancy

Vacant households may contribute to ridership demand; if units are unoccupied, there is a decreased density of residents and thus less people who may need to utilize buses. There seems to be a relatively random spatial pattern on percentage of vacant housing units. It appears more densified areas such as downtown see higher rates of vacancy, with high vacanies also along some of the popular BRIO corridors into the north and east.

ggplot()+
  geom_sf(data=hex_final, aes(fill=q5(vacantHUPct)), color=NA)+
  scale_fill_manual(values = palette5cont, 
                    labels = c("Bottom 20%", "20 to 40%", "40 to 60%", "60 to 80%", "Top 20%"),
                    name="% Vacancy\nQuintiles") +
  labs(title="Vacant Housing Units",
       subtitle="El Paso, TX",
       caption="Data: ACS, 2019")+
  mapTheme3

Median Household Income

Additionally, we want to equitably serve people of all socioeconomic statuses with transit, while also ensuring areas of disinvestment/lower median household income gain greater capital investments in transit infrastructure. There is a spatial clustering effect here: highest income in the sprawling north and east and lowest income in downtown into the southeast. It appears that some of the lowest transit ridership regions also have the lowest median household income, indicating either a need for increased supply or services, or that demand for transit may not exist in these regions.

ggplot()+
  geom_sf(data=hex_final, aes(fill=q5(medHHInc)), color=NA)+
  scale_fill_manual(values = palette5cont, 
                    labels = (qBr(hex_final, "medHHInc")),
                    name="Income by\nQuintiles") +
  labs(title="Median Household Income",
       subtitle="El Paso, TX",
       caption="Data: ACS, 2019")+
  mapTheme3

National Walkability Index (2019)

A major assumption of our model is that land use patterns and urban design play a huge role in people’s accessibility and desire to use transit, or their dependency on specific modes of transportation given inaccessibility of other modes. The National Walkability Index is a geographic resource that scores every block group in the U.S. on its relative walkability. A variety of factors explain walkability, including sidewalk availability, infrastructural assessment, street connectivity, amenity proximity, and urban form conducive to walkable neighborhoods.

From the Environmental Protection Agency, we derived the National Walkability Index (NWI) scores for 2019 at the block group level. This included a variety of indicators, including vehicle ownership, density of employment, jobs, and households, intersection and transit stop density, and regional and destination accessibility.

nwi <- read_csv(paste(data_folder, "/nwi2019.csv", sep = '')) %>%
  dplyr::filter(CBSA_Name == "El Paso, TX") %>%
  dplyr::select(c("OBJECTID",
                  "GEOID10",
                  "TRACTCE",
                  "BLKGRPCE",
                  "CountHU", # number of housing units
                  "P_WrkAge", # percent working age pop. (18-64)
                  "HH", # households
                  "Pct_AO0", # % of HH owning 0 vehicles
                  "Pct_AO1", # % of HH owning 1 vehicle
                  "Pct_AO2p", # % of HH owning 2+ vehicles
                  "R_PCTLOWWAGE", # % of low wage workers
                  "D1C", # employment density
                  "D2R_JOBPOP", # jobs to population balance
                  "D2B_E8MIXA", # employment mix
                  "D2A_EPHHM", # employment + household mix
                  "D3B", # intersection density
                  "D4A", # distance from population-weighted centroid to nearest transit stop
                  "D5AR", # regional accessibility for access to jobs
                  "D5BR", # destination accessibility via transit
  )) %>% mutate(tractblock = as.numeric(TRACTCE)*10+as.numeric(BLKGRPCE))

ep_blocks <-read_sf(paste(data_folder, "/tl_2019_48_bg.shp", sep = '')) %>%
  dplyr::filter(COUNTYFP == 141) %>%
  dplyr::select(c("TRACTCE", "BLKGRPCE")) %>% mutate(tractblock = as.numeric(TRACTCE)*10+as.numeric(BLKGRPCE))

nwi_bg <- left_join(nwi, ep_blocks, by = "tractblock") %>% st_sf() %>%
  rename(
    housingUnits = CountHU,
    pctWorkingAge = P_WrkAge,
    housingUnits = CountHU,
    pct0Car = Pct_AO0,
    pct1Car = Pct_AO1,
    pct2Car = Pct_AO2p,
    pctLowWage = R_PCTLOWWAGE,
    employmentDensity = D1C,
    jobsPopBalance = D2R_JOBPOP,
    employmentMix = D2B_E8MIXA,
    employmentHHMix = D2A_EPHHM,
    intersectionDensity = D3B,
    nearestTransit = D4A,
    jobAccessibility = D5AR,
    destinationAccessibility = D5BR
  ) %>%
  dplyr::select(-c("OBJECTID",
                   "GEOID10",
                   "TRACTCE.x",
                   "TRACTCE.y",
                  "BLKGRPCE.y",
                   "BLKGRPCE.x",
                  "tractblock")) %>% 
  st_transform(st_crs(el_paso))



nwi_bg_NA <- st_intersection(nwi_bg, el_paso) %>%
  dplyr::filter(nearestTransit != -99999.00)


nwi_bg_st_NA <- st_drop_geometry(nwi_bg_NA)

Nearest Transit Accessibility

The nearest transit accessibility score is calculated as the distance (in meters) from the population-weighted centroid. This involves aggregating all bus stops within each census block into the block, determining the centroid of the block based on population distribution, and calculating the distance between the centroid and its nearest stop. This method takes into account accessibility for the greatest number of people within the block, which is its strength. However, there may be issues with the choice of areal unit.

For the purpose of visualization, census blocks that do not have a transit stop within them were reassigned a value of -1, as they were originally assigned a value of -99999, heavily skewing the data. From the remaining blocks, the histogram shows a unimodal distribution with a peak around 400 meters, indicating that a transit stop is within 0.25 miles from the population-weighted centroid for the majority of census block groups. While this generally aligns with the metric of transit accessibility, which is often measured as a transit stop within 0.25 or 0.5 miles of home, it is challenging to determine if this is the most effective measure for transit accessibility given it does not look at data at the household level.

grid.arrange(ncol=2,
ggplot(hex_final, aes(x = nearestTransit)) +
  geom_histogram(color = "black", fill = palette1) +
  labs(title = "Nearest Transit Stop Histogram",
       subtitle = "El Paso, TX (Hexbins)",
       x = "Distance (meters)",
       y = "Count",
       caption = "Data: National Walkability Index (2019)")+
  plotTheme,
ggplot()+
  geom_sf(data=hex_final, aes(fill=q5(nearestTransit)), color=NA)+
  scale_fill_manual(values = palette5cont, 
                    labels = (qBr(hex_final, "nearestTransit")),
                    name="Distance (meters)") +
 # scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance\n(meters)")+
  labs(title="Nearest Transit Stop Map",
       subtitle="El Paso, TX (Hexbins)")+
  mapTheme3+
  theme(legend.position = "bottom")
)

Destination Accessibility

The destination accessibility metric is measured as jobs within 45-minute transit commute, accounting for factors such as distance decay (accessibility of jobs may decrease with increasing distance from a particular location) and time it takes to walk to a transit stop. General Transit Feed Specification (GTFS) data was used in tandem to calculate this metric.

We can see where there is increased density of jobs (e.g. downtown, UTEP), the metric is higher.

grid.arrange(ncol=2,
ggplot(hex_final, aes(x = destinationAccessibility)) +
  geom_histogram(color = "black", fill = palette1) +
  labs(title = "Destination Accessibility Stop Histogram",
       subtitle = "El Paso, TX (Hexbins)",
       x = "Score",
       y = "Count",
       caption = "Data: National Walkability Index (2019)")+
  plotTheme,
ggplot()+
  geom_sf(data=hex_final, aes(fill=q5(destinationAccessibility)), color=NA)+
  scale_fill_manual(values = palette5cont, 
                    labels = (qBr(hex_final, "destinationAccessibility")),
                    name="Score") +
 # scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance\n(meters)")+
  labs(title="Destination Accessibility Stop Map",
       subtitle="El Paso, TX (Hexbins)")+
  mapTheme3+
  theme(legend.position = "bottom")
)

Intersection Density (D3B)

Intersection density could be a useful transit ridership metric given that if much existing roadway infrastructure already exists could indicate potential for increased ridership, ease of new route formation, and transit-oriented development potential. Moreover, intersection density can be conducive to walkability, urban density, transit demand, and trip chaining.

grid.arrange(ncol=2,
ggplot(hex_final, aes(x = intersectionDensity)) +
  geom_histogram(color = "black", fill = palette1) +
  labs(title = "Intersection Density Stop Histogram",
       subtitle = "El Paso, TX (Hexbins)",
       x = "Score",
       y = "Count",
       caption = "Data: National Walkability Index (2019)")+
  plotTheme,
ggplot()+
  geom_sf(data=hex_final, aes(fill=q5(intersectionDensity)), color=NA)+
  scale_fill_manual(values = palette5cont, 
                    labels = (qBr(hex_final, "intersectionDensity")),
                    name="Score") +
 # scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance\n(meters)")+
  labs(title="Intersection Density Stop Map",
       subtitle="El Paso, TX (Hexbins)")+
  mapTheme3+
  theme(legend.position = "bottom")
)

Employment Diversity to Occupied Housing

Finally, we also look at the employment to household mix score. This illustrates the diversity of employment sectors and job types to occupied housing units within the census block, which is scored on a scale of 0 to 1. This is a useful metric looking at both ridership and equity: areas of increased household occupancy and employment opportunity may increase ridership demand while diversity in job types allows equitable access to transit ridership opportunity. In the map below, we see some clustering of high scores in the northwest, which does have UT-El Paso and a high density of higher income, and some essential shopping and entertainment corridors in the east.

grid.arrange(ncol=2,
ggplot(hex_final, aes(x = employmentHHMix)) +
  geom_histogram(color = "black", fill = palette1) +
  labs(title = "Employment/HH Mix Histogram",
       subtitle = "El Paso, TX (Hexbins)",
       x = "Score",
       y = "Count",
       caption = "Data: National Walkability Index (2019)")+
  plotTheme,
ggplot()+
  geom_sf(data=hex_final, aes(fill=q5(employmentHHMix)), color=NA)+
  scale_fill_manual(values = palette5cont, 
                    labels = (qBr(hex_final, "employmentHHMix")),
                    name="Score") +
 # scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance\n(meters)")+
  labs(title="Employment/HH Mix Map",
       subtitle="El Paso, TX (Hexbins)")+
  mapTheme3+
  theme(legend.position = "bottom")
)

Longitudinal Employment-Household Data (LEHD)

Longitudinal Employment-Household Data (LEHD) is a dataset that provides detailed information on employment and earnings in the United States. It is produced by the U.S. Census Bureau in partnership with state labor market agencies to help create a comprehensive, state-level picture of employment and earning patterns over time.

We accessed LEHD for the year 2019 from the lehdr package for the state of Texas at the census tract level, and aggregated features related to number of jobs; age, race, gender, educational level of workers; income of workers; and job sector distribution. We decided LEHD would be a useful mechanism to bake equity and accessibility features into our model. Considering different job sectors and worker demographics may have increased demand for transit (e.g., tracts with higher low wage workers, lower-skilled labor, etc.), it will likely increase the predictive power of transit ridership.

#https://lehd.ces.census.gov/data/lodes/LODES7/LODESTechDoc7.5.pdf

#get lhodes work place data
tx_work <- grab_lodes(state = "tx", 
                      year = 2019, 
                      lodes_type = "wac", 
                      job_type = "JT01", 
                      segment = "S000", 
                      state_part = "main", 
                      agg_geo = "tract") %>% 
  rename(total_jobs = C000,
         age_29_or_younger = CA01,
         age_30_to_54 = CA02,
         age_55_or_older = CA03,
         monthly_income_1250_or_less = CE01,
         monthly_income_1251_to_3333 = CE02,
         monthly_income_3334_or_more = CE03,
         agriculture = CNS01, # agriculture, forestry, fishing, hunting / NAICS11
         mining = CNS02, # mining, oil/gas extraction / MAICS21
         utilities = CNS03, # utilities / NAICS22
         construction = CNS04, # construction / NAICS23
         manufacturing = CNS05, # manufacturing /NAICS31 and NAICS33
         wholesaleTrade = CNS06, # wholesale trade / NAICS42
         retailTrade = CNS07, # retail trade / 44/46
         transportationWarehousing = CNS08, # transportation and warehousing / 48, 49
         informationTech = CNS09, # information /51
         financeInsurance = CNS10, # finance and insurance /52
         realEstate = CNS11, # real estate /53
         techServices = CNS12, # professional, scientific, and tech services /54
         management = CNS13, # management (enterprise) /55
         wasteRemediation = CNS14, # waste / remediation /56
         educational = CNS15, # education /61
         healthcareSocial = CNS16, # healthcare/social services /62
         artsEntertainmentRec = CNS17, # arts/entertainment/rec /71
         foodServices = CNS18, # food services /72
         otherJobs = CNS19, # other /81
         publicAdmin = CNS20, # public admin /92
         white_work = CR01,
         black_work = CR02,
         native_american_work = CR03,
         asian_work = CR04,
         pacific_work = CR05,
         mixed_race_work = CR07,
         not_hispanic_work = CT01,
         hispanic_work = CT02,
         male_work = CS01,
         female_work = CS02,
         underHS_work = CD01,
         HS_work = CD02,
         associate_work = CD03,
         bachProfessional_work = CD04)

tx_work <- tx_work %>%
  mutate(
    age29youngerPct = (age_29_or_younger/total_jobs),
    age30to54Pct = (age_30_to_54/total_jobs),
    age55olderPct = (age_55_or_older/total_jobs),
    income1250LessPct = (monthly_income_1250_or_less/total_jobs),
    income1251to3333Pct = (monthly_income_1251_to_3333/total_jobs),
    income3333OverPct = (monthly_income_3334_or_more/total_jobs),
    agriculturePct = (agriculture/total_jobs),
    miningPct = (mining/total_jobs),
    utilitiesPct = (utilities/total_jobs),
    constructionPct = (construction/total_jobs),
    manufacturingPct = (manufacturing/total_jobs),
    wholesaleTradePct = (wholesaleTrade/total_jobs),
    retailTradePct = (retailTrade/total_jobs),
    transportationWarehousingPct = (transportationWarehousing/total_jobs),
    informationTechPct = (informationTech/total_jobs),
    financeInsurancePct = (financeInsurance/total_jobs),
    realEstatePct = (realEstate/total_jobs),
    techServicesPct = (techServices/total_jobs),
    managementPct = (management/total_jobs),
    wasteRemediationPct = (wasteRemediation/total_jobs),           
    educationalPct = (educational/total_jobs),               
    healthcareSocialPct = (healthcareSocial/total_jobs),           
    artsEntertainmentRecPct = (artsEntertainmentRec/total_jobs),       
    foodServicesPct = (foodServices/total_jobs),
    otherJobsPct = (otherJobs/total_jobs),
    publicAdminPct = (publicAdmin/total_jobs),
    whiteWorkPct = (white_work/total_jobs),
    blackWorkPct = (black_work/total_jobs),
    nativeAmericanWorkPct = (native_american_work/total_jobs),
    asianWorkPct = (asian_work/total_jobs),
    pacificWorkPct = (pacific_work/total_jobs),
    mixedRaceWorkPct = (mixed_race_work/total_jobs),
    notHispanicWorkPct = (not_hispanic_work/total_jobs),
    hispanicWorkPct = (hispanic_work/total_jobs),
    underHSWorkPct = (underHS_work/total_jobs),
    hsWorkPct = (HS_work/total_jobs),
    associateWorkPct = (associate_work/total_jobs),
    bachProfessionalWorkPct = (bachProfessional_work/total_jobs),
    maleWorkPct = (male_work/total_jobs),
    femaleWorkPct = (female_work/total_jobs)
  )

lehd_ep <- left_join(ep_tracts, tx_work, by=c('GEOID' = 'w_tract')) %>%
  dplyr::select(-c(,"GEOID","CFA01", "CFA02", "CFA03", "CFA04", "CFA05", "CFS01", "CFS02", "CFS03", "CFS04", "CFS05", "area_sqmile", "year", "state"))

#ep_work_census <- ep_work_census %>%
 # select(-starts_with("CF"))

lhed_ep_st <- lehd_ep %>%
  st_drop_geometry()

Income Distribution of Jobs - Monthly Income

We calculated the percentage of each monthly income bracket of the census tracts - which in LEHD are divided as making under 1250 per month, between 1251 and 3333 per month, or over 3334 per month. There appears to be small clusters of each income bracket found throughout the map; specifically similar clustering patterns among the lowest and middle income brackets with much different clusters in the higher income bracket.

grid.arrange(ncol=3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=income1250LessPct*100), color=NA)+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="%")+
  labs(title="Under $1250",
       caption="Data: LEHD, 2019")+
  mapTheme3+
  theme(legend.position = 'bottom'),
ggplot()+
  geom_sf(data=hex_final, aes(fill=income1251to3333Pct*100), color=NA)+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="%")+
  labs(title="Between $1251 to $3333",
       caption="Data: LEHD, 2019")+
  mapTheme3+
  theme(legend.position = 'bottom'),
ggplot()+
  geom_sf(data=hex_final, aes(fill=income3333OverPct*100), color=NA)+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="%")+
  labs(title="Over $3334",
       caption="Data: LEHD, 2019")+
  mapTheme3+
  theme(legend.position = 'bottom')
)

Age Distribution of Jobs

Additionally, we calculated the percentage of each age bracket of the census tracts - which in LEHD are divided as under 29, 30 to 54, or over 55. Similarly to the income bracket, there seems to be spatial clustering in similar locations for ages under 54 and over 55.

grid.arrange(ncol=3, top="Age Bracket Distribution of Jobs",
ggplot()+
  geom_sf(data=hex_final, aes(fill=(age29youngerPct*100)), color=NA)+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="%")+
  labs(title="Age < 29",
       caption="Data: LEHD, 2019")+
  mapTheme3+
  theme(legend.position = 'bottom'),
ggplot()+
  geom_sf(data=hex_final, aes(fill=(age30to54Pct*100)), color=NA)+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="%")+
  labs(title="Age 30 to 54",
       caption="Data: LEHD, 2019")+
  mapTheme3+
  theme(legend.position = 'bottom'),
ggplot()+
  geom_sf(data=hex_final, aes(fill=(age55olderPct*100)), color=NA)+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="%")+
  labs(title="Age 55+",
       caption="Data: LEHD, 2019")+
  mapTheme3+
  theme(legend.position = 'bottom')
)

Job Types

LEHD also provides job sector data, with each hex presented below with percent distribution of type of job.

epworkjobs <- lehd_ep[, c("agriculturePct", 
                          "miningPct", 
                          "utilitiesPct",
                          "constructionPct",
                          "manufacturingPct",
                          "wholesaleTradePct",
                          "retailTradePct",
                          "transportationWarehousingPct",
                          "informationTechPct",
                          "financeInsurancePct",
                          "realEstatePct",
                          "techServicesPct",
                          "managementPct",
                          "wasteRemediationPct",
                          "educationalPct",
                          "healthcareSocialPct",
                          "artsEntertainmentRecPct",
                          "foodServicesPct",
                          "otherJobsPct",
                          "publicAdminPct")] %>% st_drop_geometry() %>% mutate(across(2:20, ~ round(. * 100, 2)))

library(DT)
datatable(epworkjobs)

Open Data El Paso

Open Data El Paso is the city’s open-source data platform. We pulled several demographic and built environment features to engineer into our model that explain the impacts of transit demand. Particularly, we calculated 1st, 3rd, and 5th nearest neighbor distances for each of our features.

We gathered distance information for parks, schools, and roadway infrastructure. We assumed for parks and schools that people may use transit to access these amenities and diverse groups use and benefit from these services. We assumed major roadways are predictive of ridership as several transit routes with high ridership fall along major roadway corridors.

The nearest feature to each hexbin is mapped below.

schools <- schools %>%
  dplyr::filter(DISTRICT == "EL PASO") %>%
  dplyr::filter(TYPE_ != "ADMIN") %>%
  dplyr::filter(TYPE_ != "FACILITIES")

# ggplot() +
#   geom_sf(data = el_paso, fill = "grey", color = NA) +
#   geom_sf(data = final_hex, aes(color = TYPE_), size = 2, alpha=0.85) +
#   scale_color_manual(values = palette6, name = "Type") +
#   guides(color = guide_legend(override.aes = list(shape = 22, size = 5, fill=palette6))) +
#   labs(title="Schools",
#        subtitle="El Paso Independent School District",
#        caption="Source: Open Data El Paso, 2023")+
#   mapTheme

parks <- parks %>%
  dplyr::filter(CATEGORY %in% c("City Park", "Open Space", "Trail", "Trailhead"))
# # ggplot() +
#   geom_sf(data = el_paso, fill = "grey", color = NA) +
#   geom_sf(data = parks, aes(fill = CATEGORY, color = CATEGORY), size = 2, alpha=0.85) +
#   scale_fill_manual(values = palette6[1:4], name = "Type") +
#   scale_color_manual(values = palette6[1:4], name = "Type") +
#   guides(color = guide_legend(override.aes = list(shape = 22, size = 5, fill=palette6[1:4], color=palette6[1:4]))) +
#   labs(title="Parks",
#        subtitle="El Paso, TX",
#        caption="Source: Open Data El Paso, 2023")+
#   mapTheme
road_class <- c(
  "LOCAL" = "Local",
  "MINOR" = "Minor",
  "FREEWAY" = "Freeways",
  "MAJOR" = "Major",
  "COLLECTOR" = "Collector",
  "INTERSTATE" = "Interstate",
  "LOCAL\r\n" = "Local",
  "LOCAL\r\n\r\n" = "Local",
  "LOCAL\r\n\r\n\r\n" = "Local",
  "LOCAL\r\n\r\n\r\n\r\n" = "Local"
)

roads$CLASS <- road_class[roads$CLASS]

roads <- roads %>%
  filter(!is.na(CLASS))

roads_sf <- st_as_sf(roads, coords = NULL)

class_count <- table(roads_sf$CLASS)

class_count_df <- data.frame(CLASS = names(class_count), Count = as.vector(class_count))

# ggplot(class_count_df, aes(x = CLASS, y = Count, fill = CLASS)) +
#   geom_col() +
#   scale_fill_manual(values=c(palette6),
#                     name='Road Class')+
#   labs(title = "Road Classifications",
#        subtitle = "El Paso, TX",
#        caption = "Data: Open Data El Paso, 2023",
#        x = "Class",
#        y = "Count") +
#   plotTheme

major <- roads %>%
  dplyr::filter(CLASS == "Major") %>%
  st_transform(crs=4269)

majorEP <- st_intersection(el_paso, major)

# grid.arrange(ncol=3,
# ggplot() +
#   geom_sf(data = el_paso, fill = "grey", color = NA) +
#   geom_sf(data = schools, aes(color = TYPE_), size = 2, alpha=0.85) +
#   scale_color_manual(values = palette6, name = "Type") +
#   guides(color = guide_legend(override.aes = list(shape = 22, size = 5, fill=palette6))) +
#   labs(title="Schools",
#        subtitle="El Paso Independent School District",
#        caption="Source: Open Data El Paso, 2023")+
#   mapTheme+theme(legend.position= "bottom"),
# ggplot() +
#   geom_sf(data = el_paso, fill = "grey", color = NA) +
#   geom_sf(data = parks, aes(fill = CATEGORY, color = CATEGORY), size = 2, alpha=0.85) +
#   scale_fill_manual(values = palette6[1:4], name = "Type") +
#   scale_color_manual(values = palette6[1:4], name = "Type") +
#   guides(color = guide_legend(override.aes = list(shape = 22, size = 5, fill=palette6[1:4], color=palette6[1:4]))) +
#   labs(title="Parks",
#        subtitle="El Paso, TX",
#        caption="Source: Open Data El Paso, 2023")+
#   mapTheme+theme(legend.position= "bottom"),
# ggplot()+
#   geom_sf(data = el_paso, fill = "grey", color = NA) +
#   geom_sf(data = majorEP, aes(fill=CLASS, color=CLASS), size = 4, alpha=0.85) +
#   scale_fill_manual(values=palette1, name='Road Class')+
#   scale_color_manual(values=palette1, name='Road Class')+
#   labs(title="Major Roads",
#        subtitle="El Paso, TX",
#        caption="Data: Open Data El Paso, 2023")+
#   mapTheme+theme(legend.position= "bottom"))

grid.arrange(ncol=3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=(schools.nn1)), color=NA)+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  labs(title="Distance to Nearest School", subtitle="El Paso, TX",
       caption="Data: Open Data El Paso")+
  mapTheme3+
  theme(legend.position = 'bottom'),
ggplot()+
  geom_sf(data=hex_final, aes(fill=(parks.nn1)), color=NA)+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  labs(title="Distance to Nearest Park", subtitle="El Paso, TX",
       caption="Data: Open Data El Paso")+
  mapTheme3+
  theme(legend.position = 'bottom'),
ggplot()+
  geom_sf(data=hex_final, aes(fill=(major_road.nn1)), color=NA)+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  labs(title="Distance to Nearest Major Roadway", subtitle="El Paso, TX",
       caption="Data: Open Data El Paso")+
  mapTheme3+
  theme(legend.position = 'bottom')
)

## Open Street Map

Open Street Map (OSM) is an open-source, collaborative, free geographic database in which users can aggregate features related to different environmental features of a place, such as buildings, land uses, and roadways. We hypothesized that distance to a variety of built environment features may be predictive of bus ridership. We chose features from keys “amenity” and “shop” listed below:

Amenities: hospital, clinic, pharmacy, restaurant, cafe, bar, place_of_worship, cinema, theatre
Shops: department_store, supermarket, mall

One thing to note about OSM is that since it is a volunteer-based platform, it is not guaranteed every feature that matches each value may yet be aggregated into Open Street Map; however, we included all of these features still give a high-level overview of amenity distribution in the city. We chose these amenities in particular as they all have “live, work, play” attributes; these amenities may be crucial to access (e.g. pharmacy, hospital) or for entertainment purposes (e.g. theatre, bar) or for employment opportunity (e.g. supermarket, clinic). Again, distance to these features was calculated, with the nearest of each amenity to each hexbin mapped below.

# Hospital
hospital <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'amenity', 
                  value = 'hospital') %>%
  osmdata_sf()

hospital_points <- hospital$osm_points

hospital_df <- as.data.frame(hospital_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

hospital_points <- st_intersection(el_paso, hospital_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "hospital")

# Clinic
clinic <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'amenity', 
                  value = 'clinic') %>%
  osmdata_sf()

clinic_points <- clinic$osm_points

clinic_df <- as.data.frame(clinic_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

clinic_points <- st_intersection(el_paso, clinic_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "clinic")

# Pharmacy 
pharmacy <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'amenity', 
                  value = 'pharmacy') %>%
  osmdata_sf()

pharmacy_points <- pharmacy$osm_points

pharmacy_df <- as.data.frame(pharmacy_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

pharmacy_points <- st_intersection(el_paso, pharmacy_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "pharmacy")

# Restaurant
restaurant <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'amenity', 
                  value = 'restaurant') %>%
  osmdata_sf()

restaurant_points <- restaurant$osm_points

restaurant_df <- as.data.frame(restaurant_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

restaurant_points <- st_intersection(el_paso, restaurant_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "restaurant")

# Cafe
cafe <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'amenity', 
                  value = 'cafe') %>%
  osmdata_sf()

cafe_points <- cafe$osm_points

cafe_df <- as.data.frame(cafe_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

cafe_points <- st_intersection(el_paso, cafe_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "cafe")

# Bars
bar <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'amenity', 
                  value = 'bar') %>%
  osmdata_sf()

bar_points <- bar$osm_points

bar_df <- as.data.frame(bar_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

bar_points <- st_intersection(el_paso, bar_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "bar")

# Places of Worship
worship <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'amenity', 
                  value = 'place_of_worship') %>%
  osmdata_sf()

worship_points <- worship$osm_points

worship_df <- as.data.frame(worship_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

worship_points <- st_intersection(el_paso, worship_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "worship")

# Cinema
cinema <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'amenity', 
                  value = 'cinema') %>%
  osmdata_sf()

cinema_points <- cinema$osm_points

cinema_df <- as.data.frame(cinema_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

cinema_points <- st_intersection(el_paso, cinema_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "cinema")

# Theaters
theatre <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'amenity', 
                  value = 'theatre') %>%
  osmdata_sf()

theatre_points <- theatre$osm_points

theatre_df <- as.data.frame(theatre_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

theatre_points <- st_intersection(el_paso, theatre_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "restaurant")

# Department Stores
department <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'shop', 
                  value = 'department_store') %>%
  osmdata_sf()

department_store_points <- department$osm_points

department_df <- as.data.frame(department_store_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

department_store_points <- st_intersection(el_paso, department_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "department_store")

# Supermarket
supermarket <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'shop', 
                  value = 'supermarket') %>%
  osmdata_sf()

supermarket_points <- supermarket$osm_points

supermarket_df <- as.data.frame(supermarket_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

supermarket_points <- st_intersection(el_paso, supermarket_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "supermarket")

# Mall
mall <- opq(bbox = 'El Paso, Texas') %>%
  add_osm_feature(key = 'shop', 
                  value = 'mall') %>%
  osmdata_sf()

mall_points <- mall$osm_points

mall_df <- as.data.frame(mall_points) %>%
  st_as_sf() %>%
  st_transform(crs=4269)

mall_points <- st_intersection(el_paso, mall_df) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "mall")

osm_points <- rbind(
  hospital_points, 
      clinic_points,
      pharmacy_points,
      restaurant_points,
      cafe_points,
      bar_points,
      worship_points,
      cinema_points,
      theatre_points,
      department_store_points,
      supermarket_points,
      mall_points)

Feature Counts

We can see places of worship and restaurants are the most populous features, while malls and bars are the least populous.

osm_points_st <- osm_points %>%
  st_drop_geometry()

ggplot(osm_points_st, aes(y = type)) +
  geom_bar(fill = c(palette1)) +
  ggtitle("Count of Each Open Street Map Feature") +
  labs(subtitle="El Paso, TX",
       caption="Data: Open Street Map") +
  xlab("Count") +
  ylab("Type") +
  plotTheme

Feature Distributions

Below is a nearest neighbor distribution of each feature.

health <- rbind(hospital_points, clinic_points, pharmacy_points)
entertainment <- rbind(worship_points, cinema_points, theatre_points)
shop <- rbind(department_store_points, supermarket_points, mall_points)
food <- rbind(restaurant_points, cafe_points, bar_points)

# grid.arrange(
# ggplot()+
#   geom_sf(data=el_paso, fill="grey")+
#   geom_sf(data=health, aes(color=type))+
#   scale_color_manual(values=c(palette6[3:5]), name="Type")+
#   labs(title="Healthcare Facilities")+
#   mapTheme,
# ggplot()+
#   geom_sf(data=el_paso, fill="grey")+
#   geom_sf(data=entertainment, aes(color=type))+
#   scale_color_manual(values=c(palette6[3:5]), name="Type")+
#   labs(title="Entertainment Places")+
#   mapTheme,
# ggplot()+
#   geom_sf(data=el_paso, fill="grey")+
#   geom_sf(data=shop, aes(color=type))+
#   scale_color_manual(values=c(palette6[3:5]), name="Type")+
#   labs(title="Shopping Centers")+
#   mapTheme,
# ggplot()+
#   geom_sf(data=el_paso, fill="grey")+
#   geom_sf(data=food, aes(color=type))+
#   scale_color_manual(values=c(palette6[3:5]), name="Type")+
#   labs(title="Eateries")+
#   mapTheme,
# ncol=3)

grid.arrange(
ggplot()+
  geom_sf(data=hex_final, aes(fill=hospitals.nn1), color=NA)+
  labs(title="Distance to Nearest Hospital")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=clinics.nn1), color=NA)+
  labs(title="Distance to Nearest Clinic")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=pharmacies.nn1), color=NA)+
  labs(title="Distance to Nearest Pharmacy")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=restaurants.nn1), color=NA)+
  labs(title="Distance to Nearest Restaurant")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=cafes.nn1), color=NA)+
  labs(title="Distance to Nearest Cafe")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=bars.nn1), color=NA)+
  labs(title="Distance to Nearest Bar")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=worship_places.nn1), color=NA)+
  labs(title="Distance to Nearest Place of Worship")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=cinemas.nn1), color=NA)+
  labs(title="Distance to Nearest Cinema")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=theatres.nn1), color=NA)+
  labs(title="Distance to Nearest Theatre")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=department_stores.nn1), color=NA)+
  labs(title="Distance to Nearest Department Store")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=malls.nn1), color=NA)+
  labs(title="Distance to Nearest Mall")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ggplot()+
  geom_sf(data=hex_final, aes(fill=supermarkets.nn1), color=NA)+
  labs(title="Distance to Nearest Supermarket")+
  scale_fill_paletteer_c("grDevices::Red-Yellow", -1, name="Distance")+
  mapTheme3,
ncol=3)

Modeling

bus_routes <- bus_routes %>%
  st_transform(crs=4269)

flum_crop <- flum %>%
  st_transform(crs=4269) %>%
  st_crop(y = st_bbox(bus_routes)) 


excludeLU <- c("Fort Bliss Military",
               #"Fort Bliss Mixed Use (Airport)",
               #"Preserve",
               #"Industrial and/or Railyards",
               "Remote",
               "Military Reserve"
)

exclude_area <- flum_crop %>% 
  filter(COMMENTS %in% excludeLU) %>% 
  st_union()

franklin <- franklin %>%
  st_union() %>% st_as_sf() %>% st_transform(crs=4269) %>% st_buffer(dist=0.0005)

airport <- flum_crop %>% filter(OBJECTID == 1540 | COMMENTS  == "Fort Bliss Military" & OBJECTID != 2189) %>% st_buffer(.001) %>% st_union()
  
  
el_paso <- el_paso %>%
  st_transform(crs=4269)

elpaso_outline <- el_paso %>% 
  st_union() %>% 
  st_sf() %>% 
  st_crop(y = st_bbox(bus_routes)) %>% 
  st_difference(franklin) %>%
  st_difference(exclude_area) %>% 
  st_difference(airport)

hex <- st_make_grid(elpaso_outline, 
                    cellsize = .005, 
                    crs = 4269, 
                    square = F) %>% 
  st_sf() 

hex <- hex[elpaso_outline,] %>%
  mutate(uniqueID = as.numeric(rownames(.)))


ggplot()+
  geom_sf(data=el_paso, fill='grey')+
  geom_sf(data=hex, fill=c(palette1), alpha=0.7)+
  labs(title="Hex Bin Grid - El Paso, Texas")+
  mapTheme

Aggregating data to hexbin level

We aggregated the data from each source to the hexbin level. The method to do this varied based on the source data’s geometry type, and if the variable was an absolute number, instead of a percent or mean.

riderstops_sf <- riderstops %>%
  filter(!is.na(stop_lat)) %>%
  st_as_sf(coords = c('stop_lon', 'stop_lat'), crs = 4269) 

stop_riders_agg <- riderstops_sf %>% 
  group_by(TP, RT) %>% 
  summarise(ridership = sum(Ons) + sum(Offs))

stop_riders_agg_noBRT <- stop_riders_agg %>% 
  filter(! RT %in% c(205,206,207,208))
stop_riders_agg_BRT <- stop_riders_agg %>% 
  filter(RT %in% c(205,206,207,208))

ridership_net_local <- stop_riders_agg %>% # Local Bus RT ridership
  dplyr::select(ridership) %>% 
  aggregate(., hex, sum) %>%
  mutate(ridership = replace_na(ridership, 0),
         uniqueID = as.numeric(rownames(.)))

ridership_net_all <- stop_riders_agg %>% # BRIO RT Ridership
  dplyr::select(ridership) %>% 
  aggregate(., hex, sum) %>%
  mutate(ridership = replace_na(ridership, 0),
         uniqueID =  as.numeric(rownames(.)))

ridership_net_BRT <- stop_riders_agg_BRT %>%  # Total Ridership
  dplyr::select(ridership) %>% 
  aggregate(., hex, sum) %>%
  mutate(ridership = replace_na(ridership, 0),
         uniqueID =  as.numeric(rownames(.)))

# p1a <- ridership_net_local %>%
#   ggplot()+
#   geom_sf(aes(fill = ridership), color =NA)+
#   paletteer::scale_fill_paletteer_c("grDevices::Red-Yellow", -1, labels= comma)+
#   labs(fill = "Local Ridership")+
#   theme(legend.position = 'right')+
#   mapTheme
# 
# 
# p1b <- ridership_net_all %>%
#   ggplot()+
#   geom_sf(aes(fill = ridership), color =NA)+
#   paletteer::scale_fill_paletteer_c("grDevices::Red-Yellow", -1, labels= comma)+
#   labs(fill = "All Ridership")+
#   theme(legend.position = 'right')+
#   mapTheme
# 
# p1c <- ridership_net_BRT %>%
#   ggplot()+
#   geom_sf(aes(fill = ridership), color =NA)+
#   paletteer::scale_fill_paletteer_c("grDevices::Red-Yellow", -1, labels= comma)+
#   labs(fill = "BRT Ridership")+
#   theme(legend.position = 'right')+
#   mapTheme
# 
# plot_grid(p1b,p1a,p1c, ncol=3)
#

# ep_work_census <- ep_work_census %>%
#   dplyr::select(-c("GEOID", "NAME", "OBJECTID", "NAME.1", "GlobalID", "Shape_Length", "Shape_Area",
#                    "year", "state", "age_29_or_younger", "age_30_to_54", "age_55_or_older", "monthly_income_1250_or_less", "monthly_income_1251_to_3333", "monthly_income_3334_or_more", "agriculture", "mining", "utilities", "construction", "manufacturing", "wholesaleTrade", "retailTrade", "transportationWarehousing", "informationTech", "financeInsurance", "realEstate", "techServices", "management", "wasteRemediation", "educational", "healthcareSocial", "artsEntertainmentRec", "foodServices", "otherJobs", "publicAdmin", "white_work", "black_work", "native_american_work", "asian_work", "pacific_work", "mixed_race_work", "not_hispanic_work", "hispanic_work", "underHS_work", "HS_work", "associate_work", "bachProfessional_work", "male_work", "female_work"))
# 
# ep_work_census <- ep_work_census %>%
#   st_transform(crs=4269)





medHHInc_hex <-  acs_ep %>% dplyr::select(medHHInc) %>% st_interpolate_aw(hex, extensive = F) %>%
  mutate(uniqueID = as.numeric(rownames(.))) 

acs_ep_hex <- left_join(acs_ep_hex, medHHInc_hex %>%  st_drop_geometry(), by = 'uniqueID')

acs_race_hex <- acs_ep %>% dplyr::select(acs_extf_cols) %>% st_interpolate_aw(hex, extensive = F) %>%
  mutate(uniqueID = as.numeric(rownames(.)))
## add pct pop vars in seperataley
  
tx_work_hex <- lehd_ep %>% st_interpolate_aw(hex, extensive = F) %>%
  mutate(uniqueID =  as.numeric(rownames(.)))

nwi_bg <- nwi_bg %>%
  st_transform(crs=4269) %>%
  mutate(nearestTransit = ifelse(nearestTransit == -99999.00, -1, nearestTransit),
         destinationAccessibility = ifelse(destinationAccessibility == -99999, 0, destinationAccessibility)) 


nwi_ext <- c("housingUnits","HH","jobAccessibility", "destinationAccessibility" )

nwi_hex1 <- st_interpolate_aw(nwi_bg %>% dplyr::select(nwi_ext), hex, extensive = T) %>%
  mutate(uniqueID = as.numeric(rownames(.)))

nwi_hex2 <- st_interpolate_aw(nwi_bg %>% dplyr::select(-nwi_ext), hex, extensive = F) %>%
  mutate(uniqueID = as.numeric(rownames(.)))


nwi_hex <- left_join(nwi_hex1, nwi_hex2 %>% st_drop_geometry()) %>% st_sf()

final_hex <- left_join(ridership_net_local, st_drop_geometry(acs_ep_hex), by = "uniqueID") %>% 
  left_join(st_drop_geometry(tx_work_hex), by = "uniqueID") %>% 
  left_join(st_drop_geometry(nwi_hex), by = "uniqueID") %>% 
  replace(is.na(.), 0)


final_hex$uniqueID <- as.numeric(final_hex$uniqueID)

schools_hex <- schools %>%
  st_transform(crs=4269) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "school")

parks_hex <- parks %>%
  st_transform(crs=4269) %>%
  dplyr::select(c("geometry")) %>%
  mutate(type = "park")

major_hex <- major %>%
  dplyr::select(c("geometry")) %>%
  mutate(type="major_road") %>%
  st_transform(crs=4269)

vars_net <- 
  rbind(
    schools_hex,
    parks_hex,
    major_hex,
    hospital_points, 
    clinic_points,
    pharmacy_points,
    restaurant_points,
    cafe_points,      
    bar_points,
    worship_points,
    cinema_points,
    theatre_points,
    department_store_points,
    supermarket_points,
    mall_points
) %>%
  st_join(., hex, join=st_within) %>%
  st_drop_geometry() %>%
  group_by(uniqueID, type) %>%
  summarize(count = n()) %>%
    full_join(hex) %>%
    spread(type, count, fill=0) %>%
    st_sf() %>%
    dplyr::select(-`<NA>`) %>%
    na.omit() %>%
    ungroup()

st_c <- st_coordinates
st_coid <- st_centroid

vars_net <-
  vars_net %>%
    mutate(
      schools.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(schools_hex),1),
      schools.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(schools_hex),3),
      schools.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(schools_hex),5),
      parks.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(st_coid(parks_hex)),1),
      parks.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(st_coid(parks_hex)),3),
      parks.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(st_coid(parks_hex)),5),
      hospitals.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(hospital_points),1),
      hospitals.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(hospital_points),3),
      hospitals.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(hospital_points),5),
      clinics.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(clinic_points),1),
      clinics.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(clinic_points),3),
      clinics.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(clinic_points),5),
      pharmacies.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(pharmacy_points),1),
      pharmacies.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(pharmacy_points),3),
      pharmacies.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(pharmacy_points),5),
      restaurants.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(restaurant_points),1),
      restaurants.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(restaurant_points),3),
      restaurants.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(restaurant_points),5),
      cafes.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(cafe_points),1),
      cafes.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(cafe_points),3),
      cafes.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(cafe_points),5),
      bars.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(bar_points),1),
      bars.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(bar_points),3),
      bars.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(bar_points),5),
      worship_places.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(worship_points),1),
      worship_places.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(worship_points),3),
      worship_places.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(worship_points),5),
      cinemas.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(cinema_points),1),
      cinemas.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(cinema_points),3),
      cinemas.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(cinema_points),5),
      theatres.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(theatre_points),1),
      theatres.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(theatre_points),3),
      theatres.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(theatre_points),5),
      department_stores.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(department_store_points),1),
      department_stores.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(department_store_points),3),
      department_stores.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(department_store_points),5),
      supermarkets.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(supermarket_points),1),
      supermarkets.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(supermarket_points),3),
      supermarkets.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(supermarket_points),5),
      malls.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(mall_points),1),
      malls.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(mall_points),3),
      malls.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(mall_points),5),
      major_road.nn1 =
        nn_function(st_c(st_coid(vars_net)), st_c(st_coid(major_hex)),1),
      major_road.nn3 =
        nn_function(st_c(st_coid(vars_net)), st_c(st_coid(major_hex)),3),
      major_road.nn5 =
        nn_function(st_c(st_coid(vars_net)), st_c(st_coid(major_hex)),5)
    )

vars_net <- vars_net %>%
  st_drop_geometry()

final_hex <- merge(final_hex, vars_net, by = "uniqueID")

# add features for # of bus stops in hex, # of transit bay bus stops
busstopnet <- hex %>% st_join(stop_riders_agg_noBRT %>% mutate(stop1 = 1)) %>% mutate(stop1 = replace_na(stop1, 0)) %>% group_by(uniqueID) %>% summarise(stops_in_hex = sum(stop1))

busbaysnet <- hex %>% st_join(local_transit_bays %>% mutate(stop1 = 1)) %>% mutate(stop1 = replace_na(stop1, 0)) %>% group_by(uniqueID) %>% summarise(bays_in_hex = sum(stop1))

final_hex <- merge(final_hex, busstopnet %>% st_drop_geometry(), by = "uniqueID")

final_hex <- merge(final_hex, busbaysnet %>% st_drop_geometry(), by = "uniqueID") 

final_hex <- final_hex %>% dplyr::select(-contains('CF'))


### remove hexs with bus bays
bay_hex <- final_hex %>% filter(bays_in_hex > 0)

final_hex <- final_hex %>% filter(bays_in_hex == 0) %>% dplyr::select(-bays_in_hex)

We model the ridership of 2174 hexagons using the 187 features we collected. In order to make the model more generalizable over the parts of El Paso with lower ridership, we remove hexs that contain a bus bay. The hexs are all outliers in ridership, and because transit planner already know to route to them, they are less relevant to our predictions.

ggplot(bay_hex)+
  geom_sf(fill = 'black', color =NA)+
  geom_sf(data= final_hex, aes(fill = ridership), color = NA)+
  paletteer::scale_fill_paletteer_c("grDevices::Red-Yellow", -1, labels= comma)+
  mapTheme+
  labs(title = "El Paso Local Bus Ridership per Hex", subtitle = "Hexes with transit bays removed", fill ='')

Correlation Matrix

Below is a correlation matrix of several variables used in our final model.

# numericVars <- hex_final %>%
#   dplyr::select(-c("uniqueID", "ridership", "cvID", "prediction", "error", "geometry")) %>% st_drop_geometry() %>% na.omit()
# 
# numericVars2 <- numericVars %>%
#   dplyr::select(1,2,3,4,6,10,11,12,15,23,25,26,28,29,31,67,71,72,73,74,78,79,81,82,83,98,101,104,107,110,113,116,119,122,125,128,131,134,137,140)
# 
# numericVarsscale <- scale(numericVars2)
# 
# numericVarsscale[numericVarsscale < -1] <- -1
# numericVarsscale[numericVarsscale > 1] <- 1
# # Use the cor() function to calculate the correlation matrix
# correlation_matrix <- cor(numericVarsscale)
# 
# library(GPfit)
# 
# corrplot(correlation_matrix,
#   method = "color",
#   #title = "Correlation Matrix",
#   type="upper",
#   col = colorRampPalette(c(palette5cont[5], "white", palette6[4]))(100),
#   # type="lower",
#   # insig = "blank") +  
#   #   labs(title = "Correlation across numeric variables")+
#   # # guides(fill=guide_legend(title="Correlation"))+
#   # plotTheme()+
#   # theme(axis.text.x = element_text(angle=90))
# )

Correlation Matrix of Selected Model Variables

Modeling

We created a 25/75 test/train split with our input dataset and tried it on numerous model types. XG boost was the most accurate in terms of mean absolute error (MAE), so we selected it to optimize further.

Our final model was set at 100 trees and has a MAE of 275.

set.seed(13)



input <- final_hex  %>% 
  st_drop_geometry() %>% 
  mutate(cvID = sample(round(nrow(final_hex) / 24), 
                       size=nrow(final_hex), 
                       replace = TRUE) )

### Initial Split for Training and Test
data_split <- initial_split(input, strata = "ridership", prop = 0.75)
ep_train <- training(data_split)
ep_test  <- testing(data_split)


### Cross Validation
## LOGOCV on Neighborhood with group_vfold_cv()
cv_splits_geo <- group_vfold_cv(ep_train,  
                                group = "cvID")
#print(cv_splits_geo)

### Create Recipes
model_rec <- recipe(ridership ~ ., data = ep_train) %>%
  update_role(cvID, new_role = "cvID") %>%
  update_role(uniqueID, new_role = "uniqueID") %>%
  step_log(ridership, skip = TRUE, offset = 1) %>%
  step_center(all_predictors(), -ridership) %>%
  step_scale(all_predictors(), -ridership) 

#%>%  step_dummy(mega_bay)

# %>% step_ns(lat, lon, options = list(df = 4))

#tidy model binary

  #step_ns(Latitude, Longitude, options = list(df = 2))

# See the data after all transformations
#glimpse(model_rec %>% prep() %>% juice())

# rf_plan <- rand_forest() %>%
#   set_args(mtry  = tune()) %>%
#   set_args(min_n = tune()) %>%
#   set_args(trees = 2000) %>% 
#   set_engine("ranger", importance = "impurity") %>% 
#   set_mode("regression")

XGB_plan <- boost_tree() %>%
  set_args(mtry  = tune()) %>%
  set_args(min_n = tune()) %>%
  set_args(trees = 100) %>% 
  set_engine("xgboost") %>% 
  set_mode("regression")


# Hyperparameter grid for glmnet (penalization)
rf_grid <- expand.grid(mtry = seq(10,100,20), 
                       min_n = c(10,30,60))
xgb_grid <- expand.grid(mtry = seq(10,120,20), 
                       min_n = c(10,30,60))


# create workflow
# rf_wf <-
#   workflow() %>% 
#   add_recipe(model_rec) %>% 
#   add_model(rf_plan)

xgb_wf <-
  workflow() %>% 
  add_recipe(model_rec) %>% 
  add_model(XGB_plan)

control <- control_resamples(save_pred = TRUE, verbose = TRUE)
metrics <- metric_set(rmse, rsq, mape, smape, mae)

# fit model to workflow and calculate metrics


all_cores <- parallel::detectCores(logical = FALSE)

cl <- makePSOCKcluster(all_cores)
registerDoParallel(cl)


# rf_tuned <- rf_wf %>%
#   tune::tune_grid(.,
#                   resamples = cv_splits_geo,
#                   grid      = rf_grid,
#                   control   = control,
#                   metrics   = metrics)

xgb_tuned <- xgb_wf %>%
  tune::tune_grid(.,
                  resamples = cv_splits_geo,
                  grid      = xgb_grid,
                  control   = control,
                  metrics   = metrics)

## 'Best' by some metric and margin

# rf_best_params     <- select_best(rf_tuned, metric = "rmse")
xgb_best_params    <- select_best(xgb_tuned, metric = "rmse")

## Final workflow
# rf_best_wf     <- finalize_workflow(rf_wf, rf_best_params)
xgb_best_wf    <- finalize_workflow(xgb_wf, xgb_best_params)

# last_fit() emulates the process where, after determining the best model, the final fit on the entire training set is needed and is then evaluated on the test set.

# rf_val_fit_geo <- rf_best_wf %>% 
#   last_fit(split     = data_split,
#            control   = control,
#            metrics   = metrics)

xgb_val_fit_geo <- xgb_best_wf %>% 
  last_fit(split     = data_split,
           control   = control,
           metrics   = metrics)

# #random forest
# full_fit_rf <- rf_best_wf %>% fit(input)
# 
# output_rf <- input
# 
# output_rf$pred <- predict(full_fit_rf, new_data = input) %>% exp()
# 
# output_rf <- output_rf  %>% 
#   mutate(error = pred$.pred - ridership,
#          uniqueID = as.character(uniqueID))
# 
# 
# output_rf_sf <- left_join(output_rf, hex) %>% st_sf()
# 
# p1<-ggplot(output_rf_sf)+
#   geom_sf(aes(fill = error), color =NA)
# 
# 
# 
# 
# p2<-full_fit_rf %>% 
#   extract_fit_parsnip() %>% 
#   vip(num_features = 30)
# grid.arrange(p1,p2, ncol = 2)

#random forest
full_fit_xgb <-xgb_best_wf %>% fit(input)

output_xgb <- input

output_xgb$pred <- predict(full_fit_xgb, new_data = input) %>% exp() 

output_xgb <- output_xgb  %>% 
  mutate(prediction= pred$.pred,
         error = pred$.pred - ridership)


output_xgb_sf <- left_join(output_xgb, hex_final %>% dplyr::select(uniqueID)) %>% st_sf()

#st_write(output_xgb, 'xgb_output.geojson', overwrite = T, append = F)

#predicted ridership

ggplot(output_xgb_sf)+
  geom_sf(aes(fill = prediction), color =NA)+
  paletteer::scale_fill_paletteer_c("grDevices::Red-Yellow", -1, labels= comma)+
  labs(title = "XG Boost Ridership Prediction", fill = '')+
  theme(legend.position = 'right')+
  mapTheme3

p1<-ggplot(output_xgb_sf)+
  geom_sf(aes(fill = error), color =NA)+
  
   scale_fill_gradientn(
    colours = colorRampPalette(rev(RColorBrewer::brewer.pal(11, "RdBu")))(255),
    values = c(1.0, (0 - min(output_xgb_sf$error)) / (max(output_xgb_sf$error) - min(output_xgb_sf$error)), 0)
  )+
# paletteer::scale_fill_paletteer_c("grDevices::Red-Yellow", -1, labels= comma)+
  labs(fill = "XG Boost Model Error")+
  theme(legend.position = 'left')+
  mapTheme3

  


p2<-full_fit_xgb %>% 
  extract_fit_parsnip() %>% 
  vip(num_features = 15)

grid.arrange(ncol = 1, p1,p2)

This map shows where our model erred in its predictions. It over-predicted in a few downtown hexs and under-predicted along Brio routes. Obviously, the amount of stops in the hex was the most predictive. This shows the next 14 most predictive variables.

xgb_mae <- sum(abs(output_xgb$error))/nrow(output_xgb)


#rf_mae <- sum(abs(output_rf$error))/nrow(output_rf)

models_mae <- data.frame(model = c("XG Boost"),
                         mae = c(xgb_mae))
library(gt)
models_mae %>% gt()

model	mae
XG Boost	241.6001

# read in final dataset

#final_data <- read_sf("final_hex3.geojson")
# final_data <- left_join(output_xgb_sf, acs_race_hex %>% st_drop_geometry(), by = 'uniqueID')
# 
# final_data <- final_data %>% mutate(
#   ridership_per_stop = ifelse(stops_in_hex != 0, ridership / stops_in_hex, 0),
#   pred_ridership_per_stop = ifelse(stops_in_hex != 0, prediction / stops_in_hex, 0))
# 
# 
# 
# app_cols <- c('uniqueID', 'ridership', 'prediction', 'ridership_per_stop','pred_ridership_per_stop','whitePopPct' ,
# 'blackPopPct','asianPopPct','hlPopPct', 'otherRacePopPct',
# 'nhPopPct','aiPopPct','disability','medHHInc','employmentHHMix')
# 
# 
# final_data <- final_data %>% dplyr::select(app_cols)



#st_write(final_data, 'THE_FINAL_HEX.geojson', overwrite = T, append = F)

Generalizability

Tu understand the generalizability of our model, we split our hex grid into 5 groups based on total population and calculated the MAE of each group. Lowest population hexs had the smallest MAE, while the 2nd most populous group had a higher MAE than the most populous group.

output_xgb_cv <- output_xgb_sf %>% mutate(totalpop_q5 = q5(totalPop))

val_MAPE_by_hood <- output_xgb_cv %>% 
  group_by(totalpop_q5) %>% 
  summarise(RMSE = yardstick::rmse_vec(ridership, prediction),
         MAE  = yardstick::mae_vec(ridership, prediction),
         MAPE = yardstick::mape_vec(ridership, prediction)) %>% 
  ungroup() 

grid.arrange(
  ggplot(val_MAPE_by_hood, aes(x = totalpop_q5, fill = as.numeric(totalpop_q5), y = MAE)) +
  geom_bar(stat = "identity") + 
  paletteer::scale_fill_paletteer_c("grDevices::Red-Yellow", -1, labels= comma)+
  plotTheme+
  labs( x ='Population Quintile', y = 'MAE')+
  theme(legend.position = 'none'),
ggplot(output_xgb_cv)+
  geom_sf(aes(fill = as.numeric(totalpop_q5)), color =NA)+ 
  paletteer::scale_fill_paletteer_c("grDevices::Red-Yellow", -1, labels= comma)+
  mapTheme+
  theme(legend.position = 'none'),
ncol = 2
)

Application: Sun Metro Optimizer

After finalizing our modeling process, we transferred our hexbin ridership predictions into our web app, Sun Metro Optimizer. The Sun Metro Optimizer is a web application that can be navigated to see existing route-level ridership and allow for users (Sun Metro transit planners) to upload new routes and assess their ridership and equity indicators in comparison to existing routes.

Sun Metro Optimizer

When the planner opens the application, they have three different options to navigate between. The “predict ridership” tab allows users to see the actual and predicted ridership of each hex to understand potential under- or over-provision of transit services. The “current ridership” tab allows users to select individual existing routes to assess their average ridership, with the size of circles indicating ridership (greater the size, greater the ridership). The “import routes” tab allows users to upload a geojson file of bus stops, and a back-end aggregation function calculates predicted ridership of the route, and can also be compared to the other existing route’s ridership.

Sun Metro Optimizer

Throughout this process, a dashboard at the bottom of the screen will populate with metrics divided as “revenue” and “equity.” As aforementioned, we judged revenue optimization as optimizing route-level ridership. We judged equity on a few of our model variables, aggregated based on the hexes each route crosses into. Specifically, we utilized average racial distribution, median household income, sum of disabled people, and the balance of job opportunities to occupied housing units. This dashboard helps the planner better understand which routes are currently optimizing for these metrics, and how new routes can feed into the system while also optimizing for these metrics.

Sun Metro Optimizer

Because we are assessing primarily at the route and hexbin level, it is hard to see how new routes connect to existing routes and optimize these metrics for the system at large. However, this framework is just a proof of concept demonstration of how new routes can be graded against existing routes, and system-wide impacts would require deeper analysis once more efficiently drafted routes are created.

With this application, we hope Sun Metro transit planners better understand how new bus transit scenarios optimize maximal ridership and a more equitable and accessible distribution of services for all populations.

Conclusion

Sun Metro Optimizer aims to use our data-driven, politically unbiased, and equity and revenue optimizing model to understand improved bus transit route allocation. With accurate forecasts of ridership trends, Sun Metro transit planners can see areas that may be underserved by transit, and ensure that whilst they reach these regions, they optimize routes for ridership and equity.