This is the script that scrapes and assembles the data used by the package. While both the script and the data are included, this script has a very high chance of breaking, given the likelihood of websites or URLs changing, locales differing, etc. If you’d like to assemble the data differently or are just curious how it was assembled, though, this script will be of use.
A function to refresh data is not included in the package because this script depends on web structures that may vary, requires a lot of dependencies (see below) and is irrelevant to the use of the package. If you do choose to run it, keep in mind that it scrapes a lot of sources; please be courteous and don’t re-scrape more times than necessary. Due to the number and size of sources, it will take a while to run, anyway, and so is probably best run by chunk. While structured as an RMarkdown notebook for ease of documentation and structure, the HTML file it knits is not useful and is thus not included.
Packages required:
Aside from rvest and tidyverse, most could be fairly easily refactored into an alternative.
library(rvest)
Loading required package: xml2
library(tidyverse)
+ ggplot2 2.2.1.9000 Date: 2017-07-06
+ tibble 1.3.3 R: 3.4.0
+ tidyr 0.6.3.9000 GUI: RStudio 1.0.143
+ readr 1.1.1.9000 Locale: en_US.UTF-8
+ purrr 0.2.2.9000 TZ: America/New_York
+ dplyr 0.7.1.9000
+ stringr 1.2.0.9000
Conflicts --------------------------------------------------------------------
* filter(), from dplyr, masks stats::filter()
* lag(), from dplyr, masks stats::lag()
# CIA World Factbook country names
# Scrape one table row per country (identified by its GEC id attribute).
# The second cell mixes bolded labels ("conventional long form:", etc.) with
# bare text values; the labels become keys and the text nodes their values.
# Keys are then normalized so every country ends up with the same set of
# name-form columns, and spread() makes one row per country.
country_names <- 'https://www.cia.gov/library/publications/the-world-factbook/fields/2142.html' %>%
read_html() %>%
html_nodes('table tr[id]') %>%
# name each row's <td> nodes by the row's id attribute (the GEC code), uppercased
{ setNames(map(., html_nodes, 'td'), toupper(html_attr(., 'id'))) } %>%
map_df(~data_frame(
country = html_text(.x[1]),
# <strong> elements hold the name-form labels...
key = map(.x[2], html_nodes, css = 'strong') %>% map(html_text),
# ...and the bare text nodes between them hold the values
value = map(.x[2], html_nodes, xpath = 'text()') %>%
map(html_text, trim = TRUE) %>%
map(discard, `==`, '')
), .id = 'gec') %>%
unnest() %>%
# normalize keys: strip trailing colons/backticks, fix the source's
# 'etymolgy' typo, and fold one-off labels into the standard key set
mutate(key = gsub(':\\s+$|`', '', key),
key = gsub('etymolgy', 'etymology', key),
key = gsub('official|Papiamentu', 'local', key),
key = gsub('English', 'conventional', key),
# Curacao's Dutch name is its conventional form; elsewhere Dutch names are local
key = ifelse(country == 'Curacao',
gsub('Dutch', 'conventional', key),
gsub('Dutch', 'local', key)),
# NOTE(review): row 'AS' is forced to 'Australia' — presumably the scraped
# cell held a different string; verify against the page
country = ifelse(gec == 'AS', 'Australia', country)) %>%
spread(key, value) %>%
mutate(abbreviation = ifelse(country == 'Australia', NA, abbreviation))
# CIA World Factbook country codes
# Appendix D cross-references GEC, ISO 3166 (alpha-2/alpha-3/numeric),
# STANAG 1059, and internet TLDs. Each <li> holds one entity's table cells;
# the cells are extracted positionally, transposed into columns, and '-'
# or empty entries are marked missing.
country_codes <- 'https://www.cia.gov/library/publications/the-world-factbook/appendix/appendix-d.html' %>%
read_html() %>%
html_nodes('ul#GetAppendix_D li') %>%
map(html_nodes, css = 'td') %>%
map(html_text, trim = TRUE) %>%
# drop cells 3 and 10 — NOTE(review): presumably unused page columns; verify
map(~.[c(-3, -10)]) %>%
transpose() %>%
simplify_all() %>%
setNames(c('country', 'gec', 'iso2c', 'iso3c', 'iso3n', 'stanag', 'tld', 'comment')) %>%
as_data_frame() %>%
# '-' marks a missing code; comments may simply be empty strings
mutate_all(na_if, y = '-') %>%
mutate(comment = na_if(comment, ''))
# http://geonames.nga.mil/gns/html/countrycodes.html
# http://geonames.nga.mil/gns/html/docs/GENC_ED3U5_GEC_XWALK.xlsx
# GENC <-> GEC crosswalk spreadsheet from the NGA (see the URLs in the
# comments above). Both '--' and '[None]' mark missing values.
genc_xwalk_url <- 'http://geonames.nga.mil/gns/html/docs/GENC_ED3U5_GEC_XWALK.xlsx'
genc <- rio::import(genc_xwalk_url, skip = 2, setclass = 'tbl_df', na = '--')
genc <- modify(genc, na_if, '[None]')
# Wikipedia country codes
# Follow every "Country codes: ..." page linked from the category listing
# (one page per alphabetical chunk), take the first table after each <h2>,
# and rebuild each table cell by cell: 'key' concatenates each cell's link
# and text nodes (the code-system label), 'value' is the cell's <p> text.
# spread() then makes one wide row per country, and all pages are stacked.
w <- 'https://en.wikipedia.org/wiki/Category:Lists_of_country_codes' %>%
read_html() %>%
html_nodes('a[title*="Country codes:"]') %>%
html_attr('href') %>%
paste0('https://en.wikipedia.org', .) %>%
map(read_html) %>%
map(html_nodes, 'h2 + table') %>%
modify_depth(2, html_nodes, 'td') %>%
map(map_df, ~list(
key = map(.x, html_nodes, xpath = 'a|text()') %>%
map(html_text) %>%
map_chr(paste, collapse = '') %>%
trimws(),
value = html_nodes(.x, 'p') %>% html_text()
), .id = 'row') %>%
map_df(spread, key, value) %>%
select(-row) %>%
# em-dash and '-' mark empty cells; newlines separate multiple codes
mutate_all(na_if, y = '—') %>% modify(na_if, '-') %>%
modify(~gsub('\\n', ', ', .x)) %>%
# rename the scraped column headers to short machine-friendly names
setNames(c('tld', 'calling', 'mcc', 'gec', 'gs1_gtin', 'icao_aircraft', 'icao_airport', 'ioc', 'iso2c', 'iso3c', 'iso3n', 'itu_callsign', 'itu', 'itu_maritime', 'license_plate', 'marc', 'stanag', 'nato2c', 'undp', 'wmo'))
# # Wikipedia FIFA codes - unused
# fifa <- 'https://en.wikipedia.org/wiki/List_of_FIFA_country_codes' %>%
# read_html() %>%
# html_nodes('table.wikitable') %>%
# map(html_table, fill = TRUE) %>%
# .[-7:-9] %>% # remove obsolete codes tables
# reduce(full_join) %>%
# mutate(FIFA = coalesce(Code, FIFA)) %>%
# select(-Code, -Confederation) %>%
# mutate_all(funs(gsub('\\[.*\\]', '', .))) %>% mutate_all(na_if, y = '') %>%
# setNames(tolower(names(.))) %>% rename(iso3c = iso)
Unused except for corrections of Wikipedia inaccuracies.
# ITU country names, regions, codes, and TLDs — used only to verify the
# Wikipedia data (see note above this chunk).
itu_page <- read_html('https://www.itu.int/online/mm/scripts/gensel8')
itu_raw <- itu_page %>%
html_node('table') %>%
html_table(header = TRUE)
itu <- itu_raw %>%
select(-5) %>%
set_names(c('en_iso', 'itu_region', 'itu', 'tld')) %>%
mutate(tld = tolower(tld))
# UNSD M.49 overview tables: the page carries one table per language, each
# identified by an id attribute like 'downloadTableEN'; the suffix becomes
# a lowercase 'language' column so all tables can be stacked.
unstats <- 'https://unstats.un.org/unsd/methodology/m49/overview/' %>%
read_html() %>%
html_nodes('table') %>%
map_df(~html_table(.x, header = TRUE) %>%
mutate(language = tolower(sub('downloadTable', '',
html_attr(.x, 'id'))))) %>%
tbl_df()
okfn <- read_csv('https://github.com/datasets/country-codes/raw/master/data/country-codes.csv')
# http://unicode.org/copyright.html
# http://unicode.org/repos/cldr/trunk/unicode-license.txt
# modern
# List the available CLDR "modern" locales from the GitHub directory listing
cldr_main <- read_html('https://github.com/unicode-cldr/cldr-localenames-modern/tree/master/main')
langs_modern <- cldr_main %>%
html_nodes('.content span a') %>%
html_text()
# Build one territories.json URL per CLDR locale, fetch and parse each,
# extract the territory-name list, and full_join all locales into one wide
# table keyed by territory code (one name column per locale).
unicode_modern <- langs_modern %>%
# NOTE: because '.' appears among set_names()'s arguments, magrittr does
# NOT also insert it as the first argument — this is set_names(urls, langs),
# i.e. a vector of URLs named by locale code.
set_names(
paste0('https://github.com/unicode-cldr/cldr-localenames-modern/raw/master/main/',
., '/territories.json'),
.) %>%
map(jsonlite::fromJSON) %>%
# pluck the nested main/<locale>/localeDisplayNames/territories element
# by position (first, first, second, first)
map(c(1, 1, 2, 1)) %>%
simplify_all() %>%
# each locale becomes a two-column frame: territory code + that locale's names
map2(names(.),
~set_names(data_frame(names(.x), .x),
c('code', .y))) %>%
reduce(full_join, by = 'code')
# # unused
# lang_codes <- 'https://github.com/unicode-cldr/cldr-localenames-modern/raw/master/main/en-US-POSIX/languages.json' %>%
# jsonlite::fromJSON() %>%
# map(c(1,2,1)) %>% .[[1]] %>%
# simplify()
#
# lang_code_df <- data_frame(language = lang_codes,
# code = names(lang_codes))
# CLDR supplemental code mappings: ISO alpha-2 keys with numeric, alpha-3,
# GEC, and TLD equivalents (TLD dropped here as it duplicates other sources).
unicode_codes <- 'https://github.com/unicode-cldr/cldr-core/raw/master/supplemental/codeMappings.json' %>%
jsonlite::fromJSON() %>%
.[[1]] %>% .[[2]] %>%
bind_rows(.id = 'iso2c') %>%
set_names(c('iso2c', 'iso3n', 'iso3c', 'gec', 'tld')) %>%
select(-tld) %>%
# some keys are 3-character codes with no alpha-2 equivalent: use them as
# iso3c and blank the iso2c slot
mutate(iso3c = coalesce(iso3c, iso2c),
iso2c = ifelse(nchar(iso2c) > 2, NA, iso2c))
# countrycode regex patterns for name matching, keyed by ISO 3166-1 alpha-2.
# BUG FIX: the original piped the mutate() result into a second assignment
# (`%>% df <- countrycode::countrycode_data %>% ...`), which is a syntax
# error — R tries to call `%>%<-` (see the error output below). Rebuilt as
# one coherent pipeline. Kosovo has no official ISO code, so its
# user-assigned 'XK' code is copied from genc2c so it can join later.
countrycode_data_c <- countrycode::codelist %>%
mutate(iso2c = ifelse(genc2c == 'XK', 'XK', iso2c)) %>%
select(iso2c, matches('regex')) %>%
drop_na(iso2c) %>%
janitor::clean_names() %>%
set_names(sub('country_name_', '', names(.)))
Error in codelist %>% mutate(iso2c = ifelse(genc2c == "XK", "XK", iso2c)) %>% :
could not find function "%>%<-"
cia <- full_join(country_codes_c %>% drop_na(gec), country_names_c)
Joining, by = "gec"
# US-government view: GENC crosswalk joined to the CIA tables on GEC.
# full_join suffixes shared columns .x (GENC) / .y (CIA); coalesce prefers
# the GENC value, then the suffixed originals are dropped.
usg <- full_join(genc_c %>% drop_na(gec), cia, by = 'gec') %>%
mutate(iso3c = coalesce(iso3c.x, iso3c.y),
iso2c = coalesce(iso2c.x, iso2c.y),
iso3n = coalesce(iso3n.x, iso3n.y)) %>%
select(-matches('\\.'))
# Sacrifice Channel Islands for Sark
# NOTE(review): slice(-1) assumes the first okfn row is Channel Islands — verify
ok_un <- okfn_c %>%
slice(-1) %>%
full_join(unstats_c)
Joining, by = "iso3c"
# Join the OKFN/UN table to the US-government table on ISO numeric code,
# coalescing the columns both sides carry (preferring ok_un's .x values).
us_ok_un <- full_join(ok_un, usg, by = 'iso3n') %>%
mutate(iso3c = coalesce(iso3c.x, iso3c.y),
iso2c = coalesce(iso2c.x, iso2c.y),
gec = coalesce(gec.x, gec.y),
tld = coalesce(tld.x, tld.y)) %>%
select(-matches('\\.'))
# Add the Wikipedia columns, joining on ISO alpha-2 and coalescing every
# code system both sides carry (preferring the non-Wikipedia .x values).
us_ok_un_w <- full_join(us_ok_un, w_c, by = 'iso2c') %>%
mutate(iso3c = coalesce(iso3c.x, iso3c.y),
iso3n = coalesce(iso3n.x, iso3n.y),
gec = coalesce(gec.x, gec.y),
marc = coalesce(marc.x, marc.y),
# wmo = coalesce(wmo.x, wmo.y), # w/okfn codes differ, and no official list to verify
calling = coalesce(calling.x, calling.y),
ioc = coalesce(ioc.x, ioc.y),
stanag = coalesce(stanag.x, stanag.y),
tld = coalesce(tld.x, tld.y)) %>%
select(-matches('\\.'))
# CLDR locale names joined to the CLDR code mappings; right_join keeps
# every territory present in the locale-names table.
unicode <- unicode_codes %>%
drop_na(iso2c) %>%
right_join(unicode_modern_c, by = 'iso2c') %>%
mutate(iso3n = coalesce(iso3n.x, iso3n.y)) %>%
select(-matches('\\.')) %>%
# Assign numeric codes to CLDR-only territories so they can join.
# NOTE(review): 905/908/901 are in the ISO 3166-1 user-assigned 900-999
# range — presumably chosen to avoid collisions; verify. Kosovo ('XK')
# gets no alpha-3 code here.
mutate(iso3n = case_when(iso2c == 'CP' ~ '905',
iso2c == 'DG' ~ '908',
iso2c == 'XK' ~ '901',
TRUE ~ iso3n),
iso3c = ifelse(iso2c == 'XK', NA, iso3c))
# Merge the CLDR-based table with the combined official sources; 'alt'
# distinguishes alternate-code rows. Coalesce the remaining duplicated
# columns, then drop the .x/.y suffixed originals.
countries <- full_join(unicode, us_ok_un_w, by = c('iso3n', 'iso2c', 'alt')) %>%
mutate(iso3c = coalesce(iso3c.x, iso3c.y),
gec = coalesce(gec.x, gec.y)) %>%
select(-matches('\\.'))
countries <- left_join(countries, countrycode_data_c)
Joining, by = "iso2c"
# check code duplication
# For each column of the merged table, count rows (excluding 'alt'
# alternates) whose non-NA value is shared by more than one row — nonzero
# counts flag columns that cannot serve as unique keys. group_by_() is the
# old standard-evaluation verb that accepts a column name as a string
# (deprecated in later dplyr versions).
us_ok_un_w %>% names() %>%
map_int(~ countries %>%
filter(is.na(alt)) %>%
group_by_(.dots = .x) %>%
filter(n() > 1) %>%
select(gec, en, iso3c, iso2c, iso3n) %>%
# drop rows where this particular code is NA before counting
.[!is.na(.[[.x]]),] %>%
nrow()) %>%
set_names(names(us_ok_un_w))
Adding missing grouping variables: `en_iso`
Adding missing grouping variables: `fr_iso`
Adding missing grouping variables: `fifa`
Adding missing grouping variables: `gaul`
Adding missing grouping variables: `iso4217_3c`
Adding missing grouping variables: `iso4217_name`
Adding missing grouping variables: `iso4217_3n`
Adding missing grouping variables: `is_independent`
Adding missing grouping variables: `capital`
Adding missing grouping variables: `continent`
Adding missing grouping variables: `edgar`
Adding missing grouping variables: `un_region_code`
Adding missing grouping variables: `un_subregion_code`
Adding missing grouping variables: `un_intermediate_region_code`
Adding missing grouping variables: `m49`
Adding missing grouping variables: `ldc`
Adding missing grouping variables: `lldc`
Adding missing grouping variables: `sids`
Adding missing grouping variables: `is_developed`
Adding missing grouping variables: `ar_un`
Adding missing grouping variables: `ar_un_intermediate_region`
Adding missing grouping variables: `ar_un_region`
Adding missing grouping variables: `ar_un_subregion`
Adding missing grouping variables: `en_un`
Adding missing grouping variables: `en_un_intermediate_region`
Adding missing grouping variables: `en_un_region`
Adding missing grouping variables: `en_un_subregion`
Adding missing grouping variables: `es_un`
Adding missing grouping variables: `es_un_intermediate_region`
Adding missing grouping variables: `es_un_region`
Adding missing grouping variables: `es_un_subregion`
Adding missing grouping variables: `fr_un`
Adding missing grouping variables: `fr_un_intermediate_region`
Adding missing grouping variables: `fr_un_region`
Adding missing grouping variables: `fr_un_subregion`
Adding missing grouping variables: `ru_un`
Adding missing grouping variables: `ru_un_intermediate_region`
Adding missing grouping variables: `ru_un_region`
Adding missing grouping variables: `ru_un_subregion`
Adding missing grouping variables: `zh_un`
Adding missing grouping variables: `zh_un_intermediate_region`
Adding missing grouping variables: `zh_un_region`
Adding missing grouping variables: `zh_un_subregion`
Adding missing grouping variables: `en_cia_abbreviation`
Adding missing grouping variables: `alt`
Adding missing grouping variables: `en_cia`
Adding missing grouping variables: `en_cia_local`
Adding missing grouping variables: `mcc`
Adding missing grouping variables: `icao_aircraft`
Adding missing grouping variables: `icao_airport`
Adding missing grouping variables: `itu_callsign`
Adding missing grouping variables: `itu`
Adding missing grouping variables: `itu_maritime`
Adding missing grouping variables: `license_plate`
Adding missing grouping variables: `nato2c`
Adding missing grouping variables: `undp`
Adding missing grouping variables: `marc`
Adding missing grouping variables: `calling`
Adding missing grouping variables: `ioc`
Adding missing grouping variables: `stanag`
Adding missing grouping variables: `tld`
en_iso fr_iso
0 0
fifa gaul
0 0
iso4217_3c iso4217_name
104 104
iso4217_3n is_independent
104 245
capital continent
2 208
edgar un_region_code
0 248
un_subregion_code un_intermediate_region_code
248 108
m49 ldc
0 249
lldc sids
249 249
is_developed ar_un
248 0
ar_un_intermediate_region ar_un_region
108 248
ar_un_subregion en_un
248 0
en_un_intermediate_region en_un_region
108 248
en_un_subregion es_un
248 0
es_un_intermediate_region es_un_region
108 248
es_un_subregion fr_un
248 0
fr_un_intermediate_region fr_un_region
108 248
fr_un_subregion ru_un
248 0
ru_un_intermediate_region ru_un_region
108 248
ru_un_subregion zh_un
248 0
zh_un_intermediate_region zh_un_region
108 248
zh_un_subregion en_cia_abbreviation
248 0
alt en_cia
0 0
en_cia_local iso2c
0 0
mcc icao_aircraft
20 39
icao_airport itu_callsign
25 3
itu itu_maritime
0 0
license_plate nato2c
9 0
undp iso3c
0 0
iso3n gec
0 0
marc calling
3 32
ioc stanag
0 22
tld
5
# Build the `codes` metadata table documenting each column of `countries`:
# the column name, its IETF-language-tag form, the parsed tag components,
# a human-readable name, and free-text notes.
# FIXES: corrected misspellings in the user-facing notes strings
# ('offical' -> 'official', 'superceded' -> 'superseded') and removed two
# stray trailing periods from names for consistency with the other entries.
countries_colnames <- names(countries)
codes <- data_frame(column = countries_colnames,
code = gsub('_', '-', column)) %>%
# Parse each column name as an IETF language tag; safely() yields an empty
# result for names that are not valid tags, which is replaced by a single
# NA so every column survives unnest() with one row.
mutate(expansion = map(code, safely(NLP::parse_IETF_language_tag), expand = TRUE),
expansion = map(expansion, c(1, 1)),
expansion = map(expansion, ~if(length(.x) == 0) {c(Language = NA_character_)} else .x),
expansion = map(expansion, map_df, ~suppressWarnings(na_if(toString(na.omit(.x)), ''))),
expansion = map(expansion, ~set_names(.x, sub('=.*', '', names(.x))))) %>%
unnest() %>%
# Some non-language columns happen to parse as valid tags; blank their
# Language. Variant/Extension are repurposed to distinguish name sources.
mutate(Language = case_when(column == 'alt' ~ NA_character_,
column == 'mcc' ~ NA_character_,
column == 'tld' ~ NA_character_,
TRUE ~ Language),
Variant = ifelse(column == 'en_us_posix', 'POSIX', Variant),
Variant = ifelse(grepl('_iso', column), 'ISO', Variant),
Variant = ifelse(grepl('_cia', column), 'CIA World Factbook', Variant),
Extension = ifelse(grepl('_cia_', column),
sub('en_cia_', '', column), Extension),
# human-readable name for each code system
name = case_when(
column == 'iso2c' ~ 'ISO 3166-1 Alpha-2 code',
column == 'iso3c' ~ 'ISO 3166-1 Alpha-3 code',
column == 'iso3n' ~ 'ISO 3166-1 numeric code',
column == 'en_iso' ~ 'ISO English name',
column == 'fr_iso' ~ 'ISO French name',
column == 'gec' ~ 'Geopolitical Entities and Codes',
column == 'fifa' ~ 'FIFA (Fédération Internationale de Football Association) code',
column == 'gaul' ~ 'Global Administrative Unit Layers from the Food and Agriculture Organization (FAO) code',
column == 'iso4217_3c' ~ 'ISO 4217 3-character currency code',
column == 'iso4217_name' ~ 'ISO 4217 currency name',
column == 'iso4217_3n' ~ 'ISO 4217 numeric currency code',
column == 'is_independent' ~ 'Country sovereignty status from the CIA World Factbook',
column == 'capital' ~ 'Capital city',
column == 'edgar' ~ 'EDGAR country code from the SEC',
column == 'en_cia' ~ 'Country names from the CIA World Factbook',
column == 'en_cia_local' ~ 'Local country names from the CIA World Factbook',
column == 'en_cia_abbreviation' ~ 'Commonly used country abbreviations from the CIA World Factbook',
column == 'mcc' ~ 'International Telecommunication Union (ITU) Telecommunication Standardization Sector (ITU-T) E.212 Mobile Country Code',
column == 'itu_callsign' ~ 'International Telecommunication Union (ITU) callsign prefixes for radio and television stations',
column == 'itu' ~ 'International Telecommunication Union (ITU) 1-3 character country code',
column == 'itu_maritime' ~ 'International Telecommunication Union (ITU) Maritime Identification Digits',
column == 'license_plate' ~ 'Motor vehicle licence plate country code',
column == 'stanag' ~ 'North Atlantic Treaty Organization (NATO/OTAN) STANAG 1059 Letter Codes for Geographical Entities',
column == 'nato2c' ~ 'North Atlantic Treaty Organization (NATO/OTAN) 2-letter code',
column == 'undp' ~ 'United Nations Development Programme (UNDP) country code',
column == 'marc' ~ 'MAchine-Readable Cataloging (MARC) codes from the US Library of Congress',
column == 'calling' ~ 'International Telecommunication Union (ITU) Telecommunication Standardization Sector (ITU-T) E.164 international telephone calling code',
column == 'ioc' ~ 'International Olympic Committee country code',
column == 'tld' ~ 'Internet Assigned Numbers Authority (IANA) country code top-level domain',
column == 'm49' ~ 'United Nations Statistics Division (UNSD) M.49 area code',
column == 'ldc' ~ 'United Nations (UN) Least Developed Countries',
column == 'lldc' ~ 'United Nations (UN) Land Locked Developing Countries',
column == 'sids' ~ 'United Nations (UN) Small Island Developing States',
column == 'is_developed' ~ 'United Nations (UN) development status',
grepl('_un', column) ~ 'United Nations (UN) Geoscheme region name',
TRUE ~ NA_character_
),
# caveats and provenance notes surfaced to package users
notes = case_when(
column == 'gec' ~ 'Formerly FIPS Pub 10-4, which was withdrawn by NIST in 2008. Maintained until 2014 by the National Geospatial-Intelligence Agency (NGA), after which it was frozen and superseded by GENC, the US government profile of ISO 3166.',
column == 'en_iso' ~ 'Does not include Taiwan, which is not a member of the UN. A few use uncommon official forms, including North and South Korea and Bolivia.',
column == 'fr_iso' ~ 'Does not include Taiwan, which is not a member of the UN. A few use uncommon official forms, including North and South Korea and Bolivia.',
column == 'en_cia' ~ 'Many short and long forms. Uses "Burma" instead of "Myanmar".',
column == 'en_cia_local' ~ 'Many short and long forms. Includes alternatives inline. Transliterates to Latin script.',
column == 'en_cia_abbreviation' ~ 'Only included where commonly used. Includes alternatives inline.',
column == 'mcc' ~ 'Includes ranges.',
column == 'itu_callsign' ~ 'Includes ranges.',
column == 'itu_maritime' ~ 'Includes ranges.',
column == 'nato2c' ~ 'Officially deprecated in favor of STANAG 1059 (see "stanag").',
column == 'tld' ~ 'Includes leading period.',
grepl('_un_region', column) ~ 'Continent',
grepl('regex', column) ~ 'Regex used for `parse_country`.',
TRUE ~ NA_character_
)) %>%
janitor::clean_names() %>%
arrange(column) %>%
# reorder so the identifying/descriptive columns lead the tag components
select(1:2, 8:9, 3:7)
# to avoid ASCII CRAN warning
# transliterate non-ASCII characters in every column to Latin-ASCII
codes <- modify(codes, stringi::stri_trans_general, 'Latin-ASCII')
# Store the full countries table and its column names as internal package
# data (sysdata.rda), and the codes metadata as exported data.
# NOTE(review): devtools::use_data was later moved to usethis::use_data.
devtools::use_data(countries, countries_colnames,
internal = TRUE, overwrite = TRUE)
Saving countries, countries_colnames as sysdata.rda to /Users/alistaire/Documents/R_projects/passport/R
devtools::use_data(codes, overwrite = TRUE)
Saving codes as codes.rda to /Users/alistaire/Documents/R_projects/passport/data
Data is licensed according to its source, most of which are in the public domain. Exceptions include