Examples with wiki_utils

Angel Zazo, Department of Computer Science and Automatics, University of Salamanca

2024-04-13

Functions

Functions to obtain a list of Wikidata entities

w_SearchByLabel(string, langsorder=‘en’, lang=“”, instanceof=“”, Pproperty=“”, mode=c(“exact”,“startswith”,“inlabel”))

w_OccupationEntities(Qoc, nlimit=NULL, mode=c(‘default’,‘count’,‘wikipedias’))

Functions to obtain information from a list of Wikidata entities or from a single one.

w_isInstanceOf(entity_list, instanceof)

w_Wikipedias(entity_list, wikilangs=“”, instanceof=“”, nlimit=1500)

w_isValid(entity_list, nlimit=50000)

w_Property(entity_list, Pproperty, langsorder=‘en’, nlimit=10000)

w_IdentifiersOfAuthority(Pauthority, langsorder=‘en’, instanceof=“”)

Pauthority = Authority Database Property in Wikidata

w_EntityInfo(entity, langsorder=‘en’, wikilangs=“”, mode=c(‘default’,‘tiny’,‘film’))

Functions to obtain information using the Wikimedia APIs

m_Opensearch(string, project=‘en.wikipedia.org’, profile=“engine_autoselect”, redirects=“resolve”)

m_reqMediaWiki(titles, mode=c(‘wikidataEntity’,‘redirects’,‘pagePrimaryImage’,‘pageFiles’), project=‘en.wikipedia.org’, redirects=TRUE, exclude_ext=‘svg|webp|xcf’)

m_Pageviews(article, start, end, project=“en.wikipedia.org”, access=“all-access”, agent=“user”, granularity=“monthly”, redirects=FALSE)

m_XtoolsInfo(article, infotype=“articleinfo”, project=“en.wikipedia.org”, redirects=FALSE)

Functions to obtain information (viafID or cluster records) using the VIAF API

v_AutoSuggest(author) : obtains viafID

v_Search(CQL_Query, mode=c(‘default’, ‘anyField’, ‘allmainHeadingEl’, ‘allNames’, ‘allPersonalNames’, ‘allTitle’)) : obtains clusters records

Function to retrieve a cluster record using the viafID.

v_GetRecord(viafid, record_format=‘viaf.json’): retrieve a cluster record

Function to extract information from a VIAF cluster record

v_Extract(viaf, info, source=NULL)

Package installation and loading

To install and load the updated version of the wikiTools package simply run the following commands:

install.packages("wikiTools")
library(wikiTools)

Examples of Wikidata functions using WDQS

Search string “Iranzo” in different positions

Exact search in Label or exact search in AltLabel (case-sensitive and diacritic-sensitive)

Optional: limit by instanceof Wikidata class (Qxx).

Optional: return information of some properties (Pproperties, Pxxx).

# Exact search (the default mode) for "Iranzo".
df <- w_SearchByLabel(string = "Iranzo", langsorder = "es|en")
# Limit the results with the instanceof argument (one class) ...
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "es|en", instanceof = "Q5"
)
# ... several classes can be given, separated by "|".
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "es|en", instanceof = "Q5|Q101352"
)
# Also ask for some properties (Pxxx), again separated by "|".
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "es|en", instanceof = "Q5",
  Pproperty = "P21|P569|P570"
)

Search at the beginning in Label or AltLabel (diacritics and case are ignored)

# Search for labels that start with "Iranzo" (mode = "startswith").
df <- w_SearchByLabel(
  string = "Iranzo", lang = "en", langsorder = "es|en", mode = "startswith"
)
# The same search limited with instanceof (one class) ...
df <- w_SearchByLabel(
  string = "Iranzo", lang = "en", langsorder = "es|en", instanceof = "Q5",
  mode = "startswith"
)
# ... or with several classes separated by "|".
df <- w_SearchByLabel(
  string = "Iranzo", lang = "en", langsorder = "es|en",
  instanceof = "Q5|Q101352", mode = "startswith"
)
# Also ask for some properties (Pxxx); note langsorder is "en" only here.
df <- w_SearchByLabel(
  string = "Iranzo", lang = "en", langsorder = "en", instanceof = "Q5",
  Pproperty = "P21|P569|P570", mode = "startswith"
)

Search in any position in Label or AltLabel (diacritics and case are ignored)

If lang is the empty string (''), the search is performed in any language; otherwise, it is performed only in the indicated language.

# lang is left at its default, so no language is indicated for the search.
df <- w_SearchByLabel(string = "Iranzo", langsorder = "es|en", mode = "inlabel")

Search only in Chinese (Simplified) (language code: zh):

# lang = "zh" indicates the language in which the search is performed.
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "zh|es", lang = "zh", mode = "inlabel"
)

Optional instanceof and Property

# "inlabel" search limited with instanceof (one class) ...
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "es|en", instanceof = "Q5", mode = "inlabel"
)
# ... with several classes separated by "|" ...
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "es|en", instanceof = "Q5|Q101352",
  mode = "inlabel"
)
# ... and also asking for some properties (Pxxx).
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "es|en", instanceof = "Q5",
  Pproperty = "P21|P569|P570", mode = "inlabel"
)

aux: getting a vector of entities (l) to use later.

# Run the "inlabel" search again and keep its entity column as the vector l,
# which the following examples reuse.
df <- w_SearchByLabel(string = "Iranzo", langsorder = "es|en", mode = "inlabel")
l <- df$entity

w_isInstanceOf

Checks whether the elements of entity_list are instances of a Wikidata class

df <- w_isInstanceOf(entity_list = l, instanceof = "Q5")
# Rows whose instanceof_Q5 column is not TRUE:
df[!df$instanceof_Q5, ]
##                entity instanceof_Q5
## Q45987474   Q45987474         FALSE
## Q85684513   Q85684513         FALSE
## Q117783790 Q117783790         FALSE
## Q47034606   Q47034606         FALSE
## Q45976259   Q45976259         FALSE
## Q11912738   Q11912738         FALSE
## Q97101009   Q97101009         FALSE
## Q111015546 Q111015546         FALSE
## Q97101007   Q97101007         FALSE
## Q31835108   Q31835108         FALSE
## Q6058550     Q6058550         FALSE

w_Wikipedias

Search for Wikipedia pages in all/some languages

Optional: instanceOF (limit to entities which are instance of a Wikidata class)

# Wikipedia pages for the entities in l, without giving wikilangs.
df <- w_Wikipedias(entity_list = l)
# Giving some languages in wikilangs, separated by "|".
df <- w_Wikipedias(entity_list = l, wikilangs = "es|en|fr")
# Same languages, also using the optional instanceof argument.
df <- w_Wikipedias(entity_list = l, wikilangs = "es|en|fr", instanceof = "Q5")

w_OccupationEntities

Count entities, or get the entities with that occupation, also get Wikipedia pages

Note: depending on the connection speed, the nlimit parameter must be adjusted

w_OccupationEntities(Qoc = "Q2306091", mode = "count") # Qoc for Sociologist
## [1] 19308
l  <- w_OccupationEntities(Qoc = "Q2306091") # l = entities: vector
lw <- w_OccupationEntities(Qoc = "Q2306091", mode = "wikipedias") # lw = dataframe
# We can obtain the same information using the previous function w_Wikipedias:
lw2 <- w_Wikipedias(entity_list = l, wikilangs = "")
# Verifying: the "pages" value of one entity is the same in both results.
all(lw["Q10320558", "pages"] == lw2["Q10320558", "pages"])
# Verifying: split the "pages" value on the literal "|" separator and compare
# the page lists after sorting. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
all(sort(strsplit(lw["Q9061", "pages"], "|", fixed = TRUE)[[1]]) ==
    sort(strsplit(lw2["Q9061", "pages"], "|", fixed = TRUE)[[1]]))

w_isValid

Checks whether the Wikidata entities are valid. An entity is valid if it has a label or a description. If an entity exists but is not valid, it is possible that it redirects to another entity; in that case, the redirection is obtained. Other entities may have existed in the past but are currently deleted.

# Note: two new entities are added at the end of l before the check.
l2 <- c(l, "Q115637688", "Q105660123")
v <- w_isValid(l2)
# Rows whose valid column is not TRUE:
v[!v$valid, ]
##                entity valid redirection
## Q115637688 Q115637688 FALSE            
## Q105660123 Q105660123 FALSE   Q97352588

w_Property

Obtain properties of entity_list.

# Ask for several properties (Pxxx, separated by "|") of the entities in l.
p <- w_Property(
  entity_list = l, Pproperty = "P21|P569|P214", langsorder = "es|en"
)

w_IdentifiersOfAuthority

Search for Wikidata entities that have an identifier in the Wikidata authority property “Pauthority”.

Optional: instanceOf

Example: Pauthority=P4439 (has identifier in the Museo Nacional Centro de Arte Reina Sofía)

# Entities with an identifier in P4439: 1286 [human, groups, etc.]
mncars <- w_IdentifiersOfAuthority(Pauthority = "P4439", langsorder = "es|en")
# The same request with instanceof = "Q5": 1280
mncarsQ5 <- w_IdentifiersOfAuthority(
  Pauthority = "P4439", langsorder = "es|en", instanceof = "Q5"
)
# Entities that are not 'human' (Q5), i.e. rows of mncars whose entity is not
# in mncarsQ5 (see the entityDescription column):
mncars[!(mncars$entity %in% mncarsQ5$entity), ]
##                entity                                  entityLabel
## Q105687869 Q105687869            João Maria Gusmão and Pedro Paiva
## Q4517304     Q4517304                                  Chto Delat?
## Q5849776     Q5849776                                 Estrujenbank
## Q20102460   Q20102460                        Agustín Parejo School
## Q27657364   Q27657364 Midnight Gardening (Jardinería a medianoche)
## Q317874       Q317874                                     Ant Farm
##                                   entityDescription                 P4439
## Q105687869                  Portuguese artistic duo     gusmao-joao-maria
## Q4517304                     Russian art collective            chto-delat
## Q5849776                        colectivo artístico          estrujenbank
## Q20102460                                           agustin-parejo-school
## Q27657364                 cuadro de Jerónimo Elespe       elespe-jeronimo
## Q317874    American art and architecture collective              ant-farm

w_EntityInfo

Get some properties of a Wikidata entity.

df1 <- w_EntityInfo(entity = "Q134644", langsorder = "es|en")
# Also a "tiny" version
df2 <- w_EntityInfo(entity = "Q134644", langsorder = "es|en", mode = "tiny")
# Differences: fields that do not exist in the tiny row are set to "--".
# vapply() is used instead of sapply() so the filler is always a (named)
# character vector, whatever setdiff() returns.
Aleixandre <- rbind(
  df1,
  data.frame(
    c(df2,
      vapply(setdiff(names(df1), names(df2)), function(x) "--", character(1))),
    row.names = "tiny"
  )
)
BenHur <- w_EntityInfo(entity = "Q180098", langsorder = "es|en",
                       wikilangs = "es|fr", mode = "film")
Nosferatu <- w_EntityInfo(entity = "Q151895", langsorder = "es|en",
                          wikilangs = "es|fr|en", mode = "film")
# Nosferatu has a public video:
Nosferatu$video
## [1] "http://commons.wikimedia.org/wiki/Special:FilePath/Nosferatu%20%281922%2C%20English%20titles%201947%29.webm"
# Combining data-frames:
films <- rbind(BenHur, Nosferatu)

Examples of WikiMedia functions

m_Opensearch

Searches for articles that contain any of the words (note: it is better to use a long string)

Some search profiles:

# Search with the "engine_autoselect" profile and redirects = "resolve".
df <- m_Opensearch(
  string = "Duque de Alba", project = "es.wikipedia.org",
  profile = "engine_autoselect", redirects = "resolve"
)
# The same search with the "strict" profile ...
df <- m_Opensearch(
  string = "Duque de Alba", project = "es.wikipedia.org", profile = "strict"
)
# ... and with the "fuzzy" profile.
df <- m_Opensearch(
  string = "Duque de Alba", project = "es.wikipedia.org", profile = "fuzzy"
)

m_reqMediaWiki

Checks if titles are in a Wikimedia project and returns the Wikidata entity for them, if they have one.

Note that URLdecode(“a%CC%8C”) is the letter “a” with the combining caron (ǎ)

# Request the Wikidata entity of several titles in en.wikipedia.org.
df <- m_reqMediaWiki(
  titles = c("Max Planck", URLdecode("a%CC%8C"), "Max", "Cervante", "humanist"),
  mode = "wikidataEntity",
  project = "en.wikipedia.org"
)

Obtains the redirections of a page (the page itself can be a redirect to other page).

Returns a vector for each title; in each vector the first element is the destination page, and the rest are all the pages that redirect to it.

# Request the redirections of three titles in es.wikipedia.org.
a <- m_reqMediaWiki(
  titles = c("Cervantes", "Planck", "Noexiste"),
  mode = "redirects",
  project = "es.wikipedia.org"
)
a
## $Cervantes
##  [1] "Miguel de Cervantes"            "Miguel de Cerbantes"           
##  [3] "Miguel de Cervantes y Saavedra" "Miguel De Cervantes y Saavedra"
##  [5] "El manco de Lepanto"            "Miguel de cervantes"           
##  [7] "Manco de Lepanto"               "Don Miguel de Cervantes"       
##  [9] "Cervantino"                     "Cervantina"                    
## [11] "Miguel de Cervantes Saavedra"   "Cervantes Saavedra, Miguel de" 
## [13] "Miguel de Cervantes y Cortinas" "Cervantesco"                   
## [15] "Cervántico"                     "Cervantes"                     
## 
## $Planck
## [1] "Max Planck"                   "Planck"                      
## [3] "Max Karl Ernst Ludwig Planck"
## 
## $Noexiste
## [1] NA

Gets the URL of the primary image of Wikimedia pages.

Gets all URL of files inserted in the pages (images, sounds, videos…), using ‘|’ as separator, and excluding some extensions in the exclude_ext parameter.

Both modes automatically resolve redirects (the destination is in the “normalized” column of the returned data-frame).

# mode = "pagePrimaryImage" for several titles (project left at its default).
i <- m_reqMediaWiki(
  titles = c("Max Planck", URLdecode("a%CC%8C"), "Max", "Cervante", "humanist"),
  mode = "pagePrimaryImage"
)

# mode = "pageFiles" for the same titles, giving exclude_ext explicitly.
f <- m_reqMediaWiki(
  titles = c("Max Planck", URLdecode("a%CC%8C"), "Max", "Cervante", "humanist"),
  mode = "pageFiles",
  exclude_ext = "svg|webp|xcf"
)

m_Pageviews

Gets the visits that a page has had in a date interval

Optional: redirects

# Monthly page views of "Cervantes" between start and end.
v <- m_Pageviews(
  article = "Cervantes", start = "20230101", end = "20230501",
  project = "es.wikipedia.org", granularity = "monthly"
)
# The same request with the optional redirects argument set to TRUE.
vv <- m_Pageviews(
  article = "Cervantes", start = "20230101", end = "20230501",
  project = "es.wikipedia.org", granularity = "monthly",
  redirects = TRUE
)

m_XtoolsInfo

Obtains information (as vector) about an article in the Wikimedia project.

Infotype: articleinfo, prose, links

Optional: redirects

# infotype = "articleinfo", without and with redirects = TRUE.
x <- m_XtoolsInfo(
  article = "Cervantes", infotype = "articleinfo", project = "es.wikipedia.org"
)
xx <- m_XtoolsInfo(
  article = "Cervantes", infotype = "articleinfo", project = "es.wikipedia.org",
  redirects = TRUE
)

# infotype = "links": the full title directly, or "Cervantes" with redirects.
y <- m_XtoolsInfo(
  article = "Miguel de Cervantes", infotype = "links",
  project = "es.wikipedia.org"
)
yy <- m_XtoolsInfo(
  article = "Cervantes", infotype = "links", project = "es.wikipedia.org",
  redirects = TRUE
)

Gets all information (articleinfo, prose, links).

# infotype = "all": the full title directly, or "Cervantes" with redirects.
z <- m_XtoolsInfo(
  article = "Miguel de Cervantes", infotype = "all",
  project = "es.wikipedia.org"
)
zz <- m_XtoolsInfo(
  article = "Cervantes", infotype = "all", project = "es.wikipedia.org",
  redirects = TRUE
)

Examples using VIAF functions

v_AutoSuggest

Searches for authors. Sometimes the same author appears several times, under different names.

Return a data-frame.

Important: The API returns a maximum of 10 records.

v_AutoSuggest(author = "Iranzo")
##       term                                                  score  nametype  
##  [1,] "Iranzo, Antonio, 1930-2003"                          "1563" "personal"
##  [2,] "Iranzo, Carmen"                                      "1439" "personal"
##  [3,] "Iranzo, Miguel Lucas de"                             "1392" "personal"
##  [4,] "Iranzo, G., 1918-1998"                               "1365" "personal"
##  [5,] "Iranzo Muñío, María Teresa, 19..-"                   "1346" "personal"
##  [6,] "Iranzo Simón, Víctor 1850-1890"                      "1220" "personal"
##  [7,] "Iranzo Benedito, Manuel, 1867-1921"                  "1202" "personal"
##  [8,] "Iranzo, Olga"                                        "1190" "personal"
##  [9,] "Iranzo Bielsa, José, el Pastor de Andorra 1915-2016" "1170" "personal"
## [10,] "Iranzo Martín, Juan Emilio 1956-"                    "1162" "personal"
##       viafid                
##  [1,] "87262213"            
##  [2,] "46775630"            
##  [3,] "3268989"             
##  [4,] "88012748"            
##  [5,] "48297869"            
##  [6,] "87244676"            
##  [7,] "87100730"            
##  [8,] "49150565569906250223"
##  [9,] "63243927"            
## [10,] "58295559"
v_AutoSuggest(author = "Esparza, María")
##       term                                                                    
##  [1,] "Esparza, María 1898-1978"                                              
##  [2,] "Esparza, María Jesús"                                                  
##  [3,] "Esparza, Mariana Ochoa"                                                
##  [4,] "Esparza, María Elena"                                                  
##  [5,] "Esparza, Maria, 19..-...., auteure d'une thèse de sciences biologiques"
##  [6,] "Esparza, María Del Rosario Campos-"                                    
##  [7,] "Esparza, María Sanjuana Salazar"                                       
##  [8,] "Esparza, María"                                                        
##  [9,] "Esparza, María del Carmen Hernández"                                   
## [10,] "Esparza, María de los Angeles Cervantes"                               
##       score  nametype   viafid                  
##  [1,] "1022" "personal" "1335154741632153110006"
##  [2,] "1013" "personal" "9147370712141442276"   
##  [3,] "1012" "personal" "67961098"              
##  [4,] "1012" "personal" "48986872"              
##  [5,] "1011" "personal" "250164959833724021614" 
##  [6,] "515"  "personal" "118153833232264332271" 
##  [7,] "509"  "personal" "121210664"             
##  [8,] "509"  "personal" "466160667844803560008" 
##  [9,] "508"  "personal" "103543869"             
## [10,] "508"  "personal" "31757197"
v_AutoSuggest(author = "Escobar, Modesto")
##      term                                  score  nametype   viafid    
## [1,] "Escobar, Modesto, 1958-"             "1286" "personal" "75898534"
## [2,] "Escobar, Modesto"                    "1278" "personal" "75898534"
## [3,] "Escobar, Modesto, 1940-"             "1246" "personal" "6744770" 
## [4,] "Escobar, Modesto, (Escobar Espinar)" "628"  "personal" "6744770"
# Note that four rows are returned, but only two different viafids.