The data for this project come from Lab #6 of the course Archives, Data, and Analysis (DCS204). The data consist of transcriptions from the treasurer’s ledger of the Maine State Seminary from 1855 to 1857.
Because these data have been transcribed from 19th-century cursive handwriting, some transcription errors can be expected. This is particularly true of the “Location” column, which records the city or town the donation came from. For the purposes of this project, misspellings have not been corrected or changed; instead, each misspelling has been added to the list of towns for its county so that every donation is still sorted into its proper county. For example, the town of Harrison in Cumberland County is frequently spelled “Harison” in the data, so “Harison” is included in the list of towns in Cumberland County.
Prior to county identification, the data have been sorted into Maine and Non-Maine through the addition of an indicator column called is_not_maine. This column will have a 1 if the donation is from outside of Maine; this is determined by whether the sub-strings “nh”, “n.h”, or “ma” are present in each location value. Of course, there are some town names in Maine that contain these sub-strings as well, such as Bowdoinham and Madison. This has been corrected by identifying those towns and overwriting the is_not_maine value for each row containing one of these towns to be 0.
Some additional dataframes were produced in the process of this project. First, using the is_not_maine variable, the original mss_donors dataframe was separated into mss_donors_maine and mss_donors_notmaine. Each of these dataframes was used to make a table showing the distribution of donation amounts (more on this in the Analysis section). The Maine dataframe was also used to create a table summarizing the total sum and count of donations for each county.
The county summary table was used in conjunction with county spatial data from the Maine GeoLibrary data catalog to create a choropleth map of total donations by county. The spatial data were so high-resolution that the map took far too long to load, so the function rmapshaper::ms_simplify was used to simplify the shapes. The county totals were used to determine the fill color of each county.
Data preparation script
# Load packages
library(dplyr)
library(ggplot2)
library(rmapshaper)
library(sf)
# Preparing the mss_donors dataframe ----
# Indicator column: is_not_maine is 1 when the donation is treated as coming
# from outside Maine and 0 otherwise.
#
# A location is treated as non-Maine when it contains any of the sub-strings
# "nh", "n.h" (literal dot) or "ma".
# Some Maine place names contain those sub-strings too, so the exact values
# listed in maine_exceptions are forced back to 0.
#
# The whole column is computed in one vectorized step instead of row by row:
# grepl() and %in% both return FALSE for a missing Location, so an NA
# location is classified as Maine (0) rather than stopping the script, and a
# dataframe with zero rows is handled without error.
#
# NOTE(review): only the exact names below are exempted. Other Maine towns
# that contain "ma" or "nh" (e.g. "thomaston", "manchester", "machias", all
# of which appear in the county town lists further down) are still classified
# as non-Maine -- confirm none of them occur in the data.
maine_exceptions <- c("madison", "burnham", "bowdoinham", "maine")

has_out_of_state_marker <- grepl("nh|n\\.h|ma", mss_donors$Location)
is_maine_exception <- mss_donors$Location %in% maine_exceptions

# stored as numeric 0/1 so comparisons such as is_not_maine == 1 keep working
mss_donors$is_not_maine <-
  as.numeric(has_out_of_state_marker & !is_maine_exception)
# Keep only the donations that have a recorded Amount
mss_donors <- mss_donors[!is.na(mss_donors$Amount), ]
# Split the cleaned data on the is_not_maine indicator:
#   0 -> donations from Maine
mss_donors_maine <- mss_donors[mss_donors$is_not_maine == 0, ]
#   1 -> donations from outside Maine
mss_donors_notmaine <- mss_donors[mss_donors$is_not_maine == 1, ]
# adding county info to mss_donors_maine
# One character vector of town names per Maine county.
# The vectors hold the spellings as they occur in the transcribed ledger, so
# they deliberately include misspellings, abbreviations and entries with a
# trailing space (e.g. "harison", "w. waterville", "sidney ").
# Where a town was listed only under a misspelling, the correct spelling has
# been added next to it ("ellsworth", "sorrento", "litchfield", "abbot",
# "cornish") so that correctly spelled rows are not left without a county.
# No existing entry has been removed or altered.
androscoggin <- c(
  "auburn", "durham", "greene", "leeds", "lewiston", "lisbon", "livermore",
  "livermore falls", "mechanic falls", "minot", "poland", "sabattus",
  "turner", "wales"
)
aroostook <- c(
  "caribou", "presque isle", "allagash", "amity", "ashland", "bancroft",
  "blaine", "bridgewater", "castle hill", "caswell", "chapman", "crystal",
  "dyer brook", "eagle lake", "easton", "fort fairfield", "fort kent",
  "frenchville", "grand isle", "hamlin", "hammond", "haynesville", "hersey",
  "hodgdon", "houlton", "island falls", "limestone", "linneus", "littleton",
  "ludlow", "madawaska", "mapleton", "mars hill", "masardis", "merrill",
  "monticello", "new canada", "new limerick", "new sweden", "oakfield",
  "orient", "perham", "portage lake", "saint agatha", "saint francis",
  "sherman", "smyrna", "stockholm", "van buren", "wade", "washburn",
  "westfield", "westmanland", "weston", "woodland"
)
cumberland <- c(
  "baldwin", "bridgton", "brunswick", "south brunswick", "cape elizabeth",
  "casco", "chebeague island", "cumberland", "falmouth", "freeport",
  "frye island", "gorham", "goham", "gray", "harpswell", "south harpswell",
  "harrison", "harison", "long island", "naples", "new gloucester",
  "north yarmouth", "portland", "pownal", "raymond", "scarborough",
  "scarboro", "sebago", "sabago", "south portland", "standish", "westbrook",
  "windham", "yarmouth"
)
franklin <- c(
  "avon", "carrabassett valley", "carthage", "chesterville", "eustis",
  "farmington", "industry", "jay", "kingfield", "madrid", "new sharon",
  "new vineyard", "phillips", "south phillips", "rangeley", "strong",
  "temple", "weld", "wilton", "witon"
)
hancock <- c(
  "ellsweorth", "ellsworth", "amherst", "aurora", "bar harbor", "blue hill",
  "brooklin", "brooksville", "bucksport", "castine", "cranberry isles",
  "dedham", "deer isle", "eastbrook", "franklin", "frenchboro", "gouldsboro",
  "great pond", "hancock", "lamoine", "mariaville", "mount desert", "orland",
  "osborn", "otis", "penobscot", "sedgwick", "sorrentoo", "sorrento",
  "southwest harbor", "stonington", "sullivan", "surry", "swans island",
  "tremont", "trenton", "verona", "waltham", "winter harbor"
)
kennebec <- c(
  "albion", "augusta", "belgrade", "benton", "chelsea", "china", "clinton",
  "farmingdale", "fayette", "gardiner", "hallowell", "lichfield",
  "litchfield", "manchester", "monmouth", "mount vernon", "oakland",
  "pittston", "randolph", "readfield", "rome", "sidney", "sidney ",
  "vassalboro", "vienna", "waterville", "waterville ", "w. waterville",
  "wayne", "west gardiner", "windsor", "winslow", "winthrop"
)
knox <- c(
  "appleton", "camden", "camden ", "cushing", "friendship", "hope",
  "isle au haut", "north haven", "owls head", "rockport", "st. george",
  "south thomaston", "thomaston", "union", "vinalhaven", "warren",
  "washington", "rockland"
)
lincoln <- c(
  "alna", "boothbay", "boothbay ", "boothbay harbor", "bremen", "bristol",
  "damariscotta", "dresden", "edgecomb", "edgeconch", "jefferson",
  "newcastle", "nobleboro", "somerville", "south bristol", "southport",
  "waldoboro", "westport", "whitefield", "wiscasset", "monhegan"
)
oxford <- c(
  "andover", "bethel", "brownfield", "buckfield", "byron", "canton",
  "denmark", "dixfield", "fryeburg", "gilead", "greenwood", "hanover",
  "hartford", "hebron", "mt. hebron", "hiram", "lovell", "mexico", "newry",
  "norway", "otisfield", "oxford", "paris", "peru", "porter", "roxbury",
  "rumford", "stoneham", "stowe", "sumner", "sweden", "upton", "waterford",
  "west paris", "woodstock"
)
penobscot <- c(
  "bangor", "brewer", "old town", "alton", "bradford", "burlington", "carmel",
  "charleston", "chester", "clifton", "corinna", "corinth", "east corinth",
  "dexter", "dixmont", "dixmount", "east dixmont", "east dixmount",
  "e. dixmont", "east millinocket", "eddington", "enfield", "etna", "exeter",
  "garland", "glenburn", "greenbush", "greenfield", "hampden", "hermon",
  "holden", "howland", "hudson", "indian island", "kenduskeag", "lagrange",
  "lakeville", "lee", "levant", "lincoln", "lowell", "mattawamkeag",
  "maxfield", "medway", "milford", "millinocket", "mount chase", "newburgh",
  "newport", "orono", "orrington", "passadumkeag", "patten", "plymouth",
  "springfield", "stacyville", "stetson", "veazie", "winn", "woodville"
)
piscataquis <- c(
  "abbott", "abbot", "atkinson", "south atkinson", "beaver cove", "bowerbank",
  "brownville", "dover-foxcroft", "greenville", "guilford", "medford", "milo",
  "monson", "parkman", "sangerville", "sebec", "shirley", "wellington",
  "willimantic"
)
sagadahoc <- c(
  "bath", "arrowsic", "bowdoin", "bowdoinham", "georgetown", "phippsburg",
  "richmond", "topsham", "west bath", "woolwich"
)
somerset <- c(
  "anson", "athens", "bingham", "cambridge", "canaan", "cornville", "detroit",
  "embden", "fairfield", "harmony", "hartland", "jackman", "madison",
  "mercer", "moose river", "moscow", "new portland", "norridgewock",
  "palmyra", "pittsfield", "ripley", "saint albans", "skowhegan",
  "smithfield", "solon", "starks"
)
waldo <- c(
  "belfast", "belmont", "brooks", "burnham", "frankfort", "islesboro",
  "jackson", "knox", "liberty", "lincolnville", "monroe", "montville",
  "morrill", "northport", "palermo", "prospect", "searsmont", "searsport",
  "stockton springs", "swanville", "thorndike", "troy", "unity", "waldo",
  "winterport"
)
washington <- c(
  "calais", "eastport", "addison", "alexander", "baileyville", "beals",
  "beddington", "centerville", "charlotte", "cherryfield", "columbia",
  "columbia falls", "cooper", "crawford", "cutler", "danforth", "deblois",
  "dennysville", "east machias", "harrington", "indian township", "jonesboro",
  "jonesport", "lubec", "machias", "machiasport", "marshfield", "meddybemps",
  "milbridge", "northfield", "pembroke", "perry", "princeton", "robbinston",
  "roque bluffs", "steuben", "talmadge", "topsfield", "vanceboro", "waite",
  "wesley", "whiting", "whitneyville"
)
york <- c(
  "biddeford", "bidderford", "saco", "acton", "alfred", "arundel", "berwick",
  "buxton", "cornis", "cornish", "dayton", "eliot", "hollis", "kennebunk",
  "kennebunkport", "kittery", "lebanon", "limerick", "limington", "lyman",
  "newfield", "north berwick", "n. berwick", "ogunquit", "old orchard beach",
  "parsonsfield", "sanford", "shapleigh", "south berwick", "waterboro",
  "wells", "york"
)
# assigning counties to each donation ----
# getting rid of rows where Location is missing
# (an NA Location is dropped explicitly: indexing rows with an NA condition
# would otherwise insert an all-NA row for each of them)
has_location <- !is.na(mss_donors_maine$Location) &
  mss_donors_maine$Location != ""
mss_donors_maine <- mss_donors_maine[has_location, ]

# Lookup table: county name -> vector of town spellings for that county.
# The order matches the original county-by-county checks, so a spelling that
# appeared in more than one vector would still resolve to the first county.
county_towns <- list(
  Androscoggin = androscoggin,
  Aroostook    = aroostook,
  Cumberland   = cumberland,
  Franklin     = franklin,
  Hancock      = hancock,
  Kennebec     = kennebec,
  Knox         = knox,
  Lincoln      = lincoln,
  Oxford       = oxford,
  Penobscot    = penobscot,
  Piscataquis  = piscataquis,
  Sagadahoc    = sagadahoc,
  Somerset     = somerset,
  Waldo        = waldo,
  Washington   = washington,
  York         = york
)
# Flatten to two parallel vectors: every town spelling and its county
town_names <- unlist(county_towns, use.names = FALSE)
town_county <- rep(names(county_towns), times = lengths(county_towns))

# Match on whitespace-trimmed strings so that a stray leading/trailing space
# in the transcription (e.g. "lewiston ") does not leave the row unassigned.
# The Location column itself is left exactly as transcribed.
# match() returns the first hit, or NA when the town is in no list.
town_index <- match(trimws(mss_donors_maine$Location), trimws(town_names))

# Rows whose town is not in any list keep the placeholder " "
assigned_county <- town_county[town_index]
assigned_county[is.na(assigned_county)] <- " "
mss_donors_maine$county <- assigned_county
# Spatial data for the ggplot map ----
# Read the Maine county boundary polygons (downloaded from the Maine
# GeoLibrary data catalog) with sf::read_sf
counties_shape <- read_sf("Maine_County_Boundary_Polygons_Feature")
# The full-resolution geometries make the ggplot map slow to load, so they
# are simplified with rmapshaper::ms_simplify before plotting
counties_simplified <- ms_simplify(
  input = counties_shape,
  keep = 0.01,
  keep_shapes = FALSE
)
# creating a dataframe for total donation amounts from each county ----
# county_totals has one row per entry of maine_counties and three columns:
#   county         the county name
#   total          sum of Amount over the donations assigned to that county
#   num_donations  number of donations assigned to that county
# Counties with no donations get total = 0 and num_donations = 0.
#
# NOTE(review): maine_counties is not defined in the visible part of this
# script; it must already exist as a vector of county names that match the
# values written to mss_donors_maine$county -- confirm where it is created.
#
# The table is sized from maine_counties rather than a hard-coded 16, and
# each county is aggregated in one step instead of re-scanning every
# donation row inside a nested loop.

# row positions of the donations belonging to each county; which() drops NA
# comparisons, so a missing county value is simply not counted
county_rows <- lapply(
  maine_counties,
  function(current_county) which(mss_donors_maine$county == current_county)
)

county_totals <- data.frame(
  county = maine_counties,
  total = vapply(
    county_rows,
    function(rows) sum(mss_donors_maine$Amount[rows]),
    numeric(1)
  ),
  num_donations = as.numeric(lengths(county_rows)),
  row.names = NULL,
  stringsAsFactors = FALSE
)

# merging the county donation totals in to the county spatial data using dplyr::left_join
county_data_merged <- left_join(counties_simplified, county_totals, by = join_by("COUNTY" == "county"))