Module:Internet Archive
Appearance
This module is subject to page protection. It is a highly visible module in use by a very large number of pages, or is substituted very frequently. Because vandalism or mistakes would affect many pages, and even trivial editing might cause substantial load on the servers, it is protected from editing.
This Lua module is used on approximately 19,000 pages and changes may be widely noticed. Test changes in the module's /sandbox or /testcases subpages, or in your own module sandbox. Consider discussing changes on the talk page before implementing them.
Usage
There is currently 1 template that invokes this module, {{Internet Archive author}}.
If future Lua scripts for Internet Archive are created (books, film, audio, etc), this Module would be a natural location to build.
--[[
For functions related to Internet Archive
Notes:
1. Internet Archive runs Elasticsearch search engine as of 4 Nov 2015
2. Program flowchart:
Break name down into number of words
Build a base URL based on number of words (1,2,3,4,5+), use of sopt=t switch, and availability of birth-death dates
If any words contain extended-ascii characters
append extra code for wildcards based on sopt=t or w
return finished URL
3. URL length should not exceed 2000 characters or it will break certain popular browsers
4. Wildcard (*) replacements should be avoided in the first letter of the first word, and with any single-letter words
5. Changing search formulations will have impacts on existing uses of the template and off-line tools which are optimized for these search recipes.
]]
-- Module table: every public function below is attached to it and it is
-- returned at the end of the file.
local p = {}
--[[
For Template:Internet Archive author

p.author(frame) reads the arguments of the calling template (name, sname,
dname, sopt, coda, search, media, birth, death) through frame:getParent()
and returns the wikitext of an external link to an Internet Archive search,
or an error-message string.
]]

-- Split a string into its whitespace-separated words. Using one splitter for
-- both the word list and the word count keeps N[count] pointing at the last
-- word even when the name contains doubled spaces.
local function splitWords(str)
  local words = {}
  for word in mw.ustring.gmatch(str, "%S+") do
    words[#words + 1] = word
  end
  return words
end

-- Tight-search alternates built from the de-accented words of a 2-word name.
local function plainTwoWords(PN)
  local FIRST = p.urlX(PN[1])
  local LAST = p.urlX(PN[2])
  -- Last, First
  local A1 = "%20OR%20%22" .. LAST .. "%2C%20" .. FIRST
  -- First Last
  local A2 = "%22%20OR%20%22" .. FIRST .. "%20" .. LAST
  return A1 .. A2 .. "%22"
end

-- Tight-search alternates built from the de-accented words of a 3-word name.
local function plainThreeWords(PN)
  local FIRST = p.urlX(PN[1])
  local MIDDLE = p.urlX(PN[2])
  local LAST = p.urlX(PN[3])
  local firstinitialp = p.urlX(p.firstLetter(PN[1]))
  local middleinitialp = p.urlX(p.firstLetter(PN[2]))
  -- First Middle Last
  local A1 = "%20OR%20%22" .. FIRST .. "%20" .. MIDDLE .. "%20" .. LAST
  -- Last, First Middle
  local A2 = "%22%20OR%20%22" .. LAST .. "%2C%20" .. FIRST .. "%20" .. MIDDLE
  -- Last, First M.
  local A3 = "%22%20OR%20%22" .. LAST .. "%2C%20" .. FIRST .. "%20" .. middleinitialp .. "%2E"
  -- Last, F. M.
  local A4 = "%22%20OR%20%22" .. LAST .. "%2C%20" .. firstinitialp .. ".%20" .. middleinitialp .. "%2E"
  return A1 .. A2 .. A3 .. A4 .. "%22"
end

-- Tight-search alternates built from the de-accented words of a 4-word name.
local function plainFourWords(PN)
  local FIRST = p.urlX(PN[1])
  local SECOND = p.urlX(PN[2])
  local THIRD = p.urlX(PN[3])
  local LAST = p.urlX(PN[4])
  local firstinitialp = p.urlX(p.firstLetter(PN[1]))
  local secondinitialp = p.urlX(p.firstLetter(PN[2]))
  local thirdinitialp = p.urlX(p.firstLetter(PN[3]))
  -- Last, First Second Third
  local A1 = "%20OR%20%22" .. LAST .. "%2C%20" .. FIRST .. "%20" .. SECOND .. "%20" .. THIRD
  -- First Second Third Last
  local A2 = "%22%20OR%20%22" .. FIRST .. "%20" .. SECOND .. "%20" .. THIRD .. "%20" .. LAST
  -- Last, F. S. T.
  local A3 = "%22%20OR%20%22" .. LAST .. "%2C%20" .. firstinitialp .. "%2E%20" .. secondinitialp .. "%2E%20" .. thirdinitialp .. "%2E"
  return A1 .. A2 .. A3 .. "%22"
end

function p.author(frame)

  local pframe = frame:getParent()
  local args = pframe.args

  local tname = "Internet Archive author" -- name of calling template. Change if template rename.

  -- NOTE: dname, byabout, tagline, urlhead, mydate, media, mediaopen, wild and
  -- myurl are deliberately NOT local: p.IArender() takes no arguments and
  -- reads them as globals when it assembles the final link.
  byabout = "Works by or about"
  tagline = "at the [[Internet Archive]]"
  urlhead = "https://archive.org/search.php?query="
  mydate = "" -- birth-death date clause (stays empty when dates are unknown)

  --- Determine name
  -- When using template outside main article space, the 'name' parameter is required (not optional)
  local name = trimArg(args.name) or mw.title.getCurrentTitle().text

  -- Display name: article name without its final disambiguation parentheses.
  dname = mw.ustring.gsub(name, '%s+%([^%(]-%)$', '')
  -- Search name defaults to the stripped article name; both can be overridden.
  local sname = trimArg(args.sname) or dname
  dname = trimArg(args.dname) or dname

  --- Determine search option: nil, "t", "tx", "w" or "unknown"
  local sopt = trimArg(args.sopt)
  if sopt then
    local aliases = { tight = "t", tightx = "tx", wild = "w" }
    sopt = mw.ustring.lower(sopt)
    sopt = aliases[sopt] or sopt
    if sopt ~= "t" and sopt ~= "tx" and sopt ~= "w" then sopt = "unknown" end
  end

  --- Determine tagline
  local coda = trimArg(args.coda)
  if coda then
    tagline = tagline .. " " .. coda
  end

  --- Custom search. Do early to avoid unnecessary processing.
  local custom = trimArg(args.search)
  if custom then
    local search = p.ia_url_encode(custom)
    -- "[url label]" external-link syntax: a space separates URL from label,
    -- and another separates the closing bracket from the tagline.
    return "[" .. urlhead .. search .. " " .. byabout .. " " .. dname .. "] " .. tagline
  end

  -- Determine media string
  media = p.mediaTypes(args.media)
  -- Opening parenthesis matching the ")" that p.mediaTypes() emits first.
  -- (added a default mediatype Dec 2015, see p.mediaTypes())
  mediaopen = "%28"

  -- Determine date of birth and death; p.bdDate returns "<birth> <death>" or "Error"
  local dates = mw.text.split(p.bdDate(args.birth, args.death, name), " ")
  local birth = dates[1]
  local death = dates[2]
  if birth == "Error" or death == "Error" then
    return "Error in [[:Template:" .. tname .. "]]: [[" .. name .. "]] doesn't exist."
  end

  --- Split sname into words and count words
  local N = splitWords(sname)
  local count = #N
  if count == 0 then
    return "Error in [[:Template:" .. tname .. "]]: Zero-word name."
  end

  local accented = (p.ia_extendedascii(sname) == 1)

  --- Date string: ("birth-death" AND last-word), with a de-accented alternate
  if birth and death and birth ~= "none" and death ~= "none" then
    local lastword = N[count]
    if p.ia_extendedascii(lastword) == 1 then
      mydate = "%20OR%20%28%22" .. birth .. "-" .. death .. "%22%20AND%20%28%22" .. p.urlX(lastword) .. "%22%20OR%20" .. p.urlX(p.ia_deaccent(lastword)) .. "%29%29"
    else
      mydate = "%20OR%20%28%22" .. birth .. "-" .. death .. "%22%20AND%20" .. p.urlX(lastword) .. "%29"
    end
  end

  --- wild string
  wild = "%29"
  if sopt == "w" and accented then
    -- A leading wildcard (or a wildcard-only word) is too costly for the
    -- search engine, so those names get a dedicated query instead.
    if p.wildcheck(N, count) == 1 then
      myurl = p.wildfix(N, count)
      return p.IArender()
    end
    if count == 3 then
      -- (first last), skipping the middle word
      wild = "%20OR%20%28" .. p.ia_url_encode(p.ia_extendedascii2wildcard(N[1])) .. "%20" .. p.ia_url_encode(p.ia_extendedascii2wildcard(N[3])) .. "%29%29"
    else
      -- whole name with wildcards
      wild = "%20OR%20%28" .. p.ia_url_encode(p.ia_extendedascii2wildcard(sname)) .. "%29%29"
    end
  end

  --[[
  Format URL
  ]]
  -- In tight mode an accented name also searches for its de-accented spelling.
  local tight = (sopt == "t" and accented)
  local plainname = nil
  if tight then
    plainname = p.ia_deaccent(sname)
  end

  if count == 1 then
    myurl = p.oneWord(sname)
    if tight then
      myurl = myurl .. "%20OR%20%22" .. p.urlX(plainname) .. "%22"
    end
  elseif count == 2 then
    myurl = p.twoWords(N, sopt)
    if tight then
      myurl = myurl .. plainTwoWords(splitWords(plainname))
    end
  elseif count == 3 then
    myurl = p.threeWords(N, sopt)
    if tight then
      myurl = myurl .. plainThreeWords(splitWords(plainname))
    end
  elseif count == 4 then
    myurl = p.fourWords(N, sopt)
    if tight then
      myurl = myurl .. plainFourWords(splitWords(plainname))
    end
  else
    -- Five or more words: search for the whole name as typed.
    local wildwrap = (sopt == "w" and accented)
    myurl = ""
    if wildwrap then
      myurl = "%28"
    end
    myurl = myurl .. "%28" .. p.ia_url_encode(sname)
    if wildwrap then
      myurl = myurl .. "%29"
    end
    if tight then
      myurl = myurl .. "%29%20OR%20%28" .. p.ia_url_encode(plainname)
    end
  end

  return p.IArender()

end
-- Build final output and render.
-- Takes no arguments: it reads the globals urlhead, mediaopen, myurl, wild,
-- mydate, media, byabout, dname and tagline, which p.author() assigns before
-- calling it. Returns "[<url> <label>] <tagline>" external-link wikitext;
-- the spaces separate the URL from its label and the link from the tagline.
function p.IArender()
  return "[" .. urlhead .. mediaopen .. myurl .. wild .. mydate .. media .. " " .. byabout .. " " .. dname .. "] " .. tagline
end
-- Query fragment for a single-word name: the URL-encoded word is searched as
-- a quoted phrase in the subject, creator, description and title fields,
-- joined with OR. The opening "(" is emitted here; the caller closes it.
function p.oneWord(sname)
  local encoded = p.ia_url_encode(sname)
  local terms = {}
  for _, field in ipairs({ "subject", "creator", "description", "title" }) do
    terms[#terms + 1] = field .. "%3A%22" .. encoded
  end
  -- Each term opens a quote; the separator closes the previous one.
  return "%28" .. table.concat(terms, "%22%20OR%20") .. "%22"
end
-- Query fragment for a two-word name.
-- N is the word list; sopt is the search option. With sopt "t" or "tx" only
-- full-word forms are searched; otherwise the creator field additionally
-- gets the "Last, F." form.
function p.twoWords(N, sopt)
  local first = p.urlX(N[1])
  local last = p.urlX(N[2])
  local initial = p.urlX(p.firstLetter(N[1]))

  local lastFirst = last .. "%2C%20" .. first -- Last, First
  local firstLast = first .. "%20" .. last    -- First Last

  local creator = { lastFirst, firstLast }
  if not (sopt == "t" or sopt == "tx") then
    -- Last, F.
    creator[#creator + 1] = last .. "%2C%20" .. initial .. "%2E"
  end

  local groups = {
    { "subject", { lastFirst, firstLast } },
    { "creator", creator },
    { "title", { firstLast } },
    { "description", { lastFirst, firstLast } },
  }

  local terms = {}
  for _, group in ipairs(groups) do
    local prefix = group[1] .. "%3A%22"
    for _, phrase in ipairs(group[2]) do
      terms[#terms + 1] = prefix .. phrase
    end
  end
  -- Each term opens a quote; the separator closes the previous one.
  return "%28" .. table.concat(terms, "%22%20OR%20") .. "%22"
end
-- Query fragment for a three-word name.
-- N is the word list; sopt is the search option. With sopt "t" or "tx" only
-- forms that keep the middle word (spelled out or as an initial) are used;
-- otherwise plain "First Last" / "Last, First" forms are appended per field.
function p.threeWords(N, sopt)
  -- CAUTION: The following is near the max 2000 character URL limit for most browsers when using long names
  -- such as "René-Nicolas Dufriche Desgenettes".
  local first = p.urlX(N[1])
  local middle = p.urlX(N[2])
  local last = p.urlX(N[3])
  local fi = p.urlX(p.firstLetter(N[1]))
  local mi = p.urlX(p.firstLetter(N[2]))

  local SP, DOT, COMMA = "%20", "%2E", "%2C%20"

  local F_M_L = first .. SP .. middle .. SP .. last            -- First Middle Last
  local F_m_L = first .. SP .. mi .. DOT .. SP .. last         -- First M. Last
  local f_m_L = fi .. DOT .. SP .. mi .. DOT .. SP .. last     -- F. M. Last
  local f_M_L = fi .. DOT .. SP .. middle .. SP .. last        -- F. Middle Last
  local L_F_M = last .. COMMA .. first .. SP .. middle         -- Last, First Middle
  local L_F_m = last .. COMMA .. first .. SP .. mi .. DOT      -- Last, First M.
  local L_f_m = last .. COMMA .. fi .. DOT .. SP .. mi .. DOT  -- Last, F. M.
  local L_f_M = last .. COMMA .. fi .. DOT .. SP .. middle     -- Last, F. Middle
  local L_F = last .. COMMA .. first                           -- Last, First
  local F_L = first .. SP .. last                              -- First Last

  -- { field, phrases always searched, phrases added when not tight }
  local groups = {
    { "subject", { L_F_M, L_F_m, L_f_m, F_M_L, F_m_L, f_m_L }, { L_F, F_L } },
    { "creator", { F_M_L, F_m_L, f_m_L, f_M_L, L_F_M, L_F_m, L_f_m, L_f_M }, { F_L, L_F } },
    { "title", { F_M_L, F_m_L, f_m_L }, { F_L } },
    { "description", { F_M_L, F_m_L, f_m_L, L_F_M, L_F_m }, { F_L, L_F } },
  }

  local tight = (sopt == "t" or sopt == "tx")
  local terms = {}
  for _, group in ipairs(groups) do
    local prefix = group[1] .. "%3A%22"
    for _, phrase in ipairs(group[2]) do
      terms[#terms + 1] = prefix .. phrase
    end
    if not tight then
      for _, phrase in ipairs(group[3]) do
        terms[#terms + 1] = prefix .. phrase
      end
    end
  end
  -- Each term opens a quote; the separator closes the previous one.
  return "%28" .. table.concat(terms, "%22%20OR%20") .. "%22"
end
-- Query fragment for a four-word name.
-- N is the word list; sopt is the search option. With sopt "t" or "tx" only
-- the two full-name orderings are searched; otherwise the creator field also
-- gets the "Last, F. S. T." initials form.
function p.fourWords(N, sopt)
  local FIRST = p.urlX(N[1])
  local SECOND = p.urlX(N[2])
  local THIRD = p.urlX(N[3])
  local LAST = p.urlX(N[4])
  -- Initials go through p.urlX like every other name part, so a word that
  -- starts with "&" cannot inject a raw ampersand into the query string.
  local firstinitial = p.urlX(p.firstLetter(N[1]))
  local secondinitial = p.urlX(p.firstLetter(N[2]))
  local thirdinitial = p.urlX(p.firstLetter(N[3]))

  -- Last, First Second Third
  local lastFirst = LAST .. "%2C%20" .. FIRST .. "%20" .. SECOND .. "%20" .. THIRD
  -- First Second Third Last
  local firstLast = FIRST .. "%20" .. SECOND .. "%20" .. THIRD .. "%20" .. LAST

  local terms = {
    "subject%3A%22" .. lastFirst,
    "subject%3A%22" .. firstLast,
    "creator%3A%22" .. lastFirst,
    "creator%3A%22" .. firstLast,
  }
  if not (sopt == "t" or sopt == "tx") then
    -- Last, F. S. T.
    terms[#terms + 1] = "creator%3A%22" .. LAST .. "%2C%20" .. firstinitial .. "%2E%20" .. secondinitial .. "%2E%20" .. thirdinitial .. "%2E"
  end
  terms[#terms + 1] = "title%3A%22" .. firstLast
  terms[#terms + 1] = "description%3A%22" .. firstLast

  -- Each term opens a quote; the separator closes the previous one.
  return "%28" .. table.concat(terms, "%22%20OR%20") .. "%22"
end
-- ElasticSearch speed/resource problems if first letter of first word is "*" wildcard ie. accented letter
-- Build special search in these cases.
-- https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#_wildcards
--
-- N is the word list and count its length. Returns the fragment
-- (("First" OR <de-accented First>) AND ("Last" OR <de-accented Last>)
-- with its outermost parenthesis left open.
function p.wildfix(N, count)
  local firstWord, lastWord
  if count == 1 then
    -- 1-word names use the full word for both halves
    firstWord, lastWord = N[1], N[1]
  else
    -- Keep only the part before the first "-" ie. John-Taylor-Smith becomes John
    firstWord = mw.text.split(N[1], "-")[1]
    lastWord = mw.text.split(N[count], "-")[1]
  end

  -- ((Fïrst OR First) AND (Lást OR Last))
  return table.concat({
    "%28%28%22", firstWord, "%22%20OR%20", p.ia_deaccent(firstWord),
    "%29%20AND%20%28%22", lastWord, "%22%20OR%20", p.ia_deaccent(lastWord),
    "%29",
  })
end
-- Return 1 if the first byte of the first word, or the only byte of any
-- one-byte word, lies outside the printable ASCII range 32..126; else 0.
-- N is the word list, count the number of words to inspect.
function p.wildcheck(N, count)
  local function outsidePrintable(b)
    return b < 32 or b > 126
  end

  -- first letter of first word
  if outsidePrintable(N[1]:byte(1)) then
    return 1
  end

  -- any single-byte word
  for idx = 1, count do
    local word = N[idx]
    if word:len() == 1 and outsidePrintable(word:byte(1)) then
      return 1
    end
  end

  return 0
end
-- Normalize a template argument: returns the argument with surrounding
-- whitespace removed, or nil when it is missing, empty, or whitespace-only.
-- Trimming happens before the emptiness test so that "   " is treated like
-- "" (callers use the result as a truth value, and "" is truthy in Lua).
function trimArg(arg)
  if arg == nil then
    return nil
  end
  local trimmed = mw.text.trim(arg)
  if trimmed == "" then
    return nil
  end
  return trimmed
end
-- Build the media-type clause appended to the query.
-- argsmedia is the raw |media= argument: a whitespace-separated list of
-- "text"/"texts", "audio" and "video" (case-insensitive; other words are
-- ignored). Returns ") AND (<filter>)" in URL-encoded form, where the
-- leading ")" closes the parenthesis opened by the caller.
function p.mediaTypes(argsmedia)

  -- Added a default mediatype Dec 2015 due to too many false positives in the software mediatype, caused by birth-death dates catching numbers in source codes
  local media = "-mediatype:software"

  if argsmedia ~= "" and argsmedia ~= nil then
    -- accepted spelling -> Internet Archive mediatype
    local known = { text = "texts", texts = "texts", audio = "audio", video = "video" }
    -- Iterating over the words themselves (instead of counting them and
    -- indexing a separately split list) is safe for whitespace-only input.
    for word in mw.ustring.gmatch(argsmedia, "%S+") do
      local mediatype = known[mw.ustring.lower(word)]
      if mediatype then
        media = media .. p.ia_url_encode(" OR mediatype:" .. mediatype)
      end
    end
  end

  return "%29%20AND%20%28" .. media .. "%29"

end
-- Extract the year from a "[[Category:<year> <suffix>]]" link in pagetext
-- (suffix is "births" or "deaths"). Returns "none" when no such link exists.
local function yearFromCategory(pagetext, suffix)
  local link = mw.ustring.match(pagetext, "%[%[%s-[Cc]ategory:%s-%d+%.?%d*%s-" .. suffix .. "%s-%]%]")
  if link == nil then
    return "none"
  end
  return mw.ustring.match(link, "%d+%.?%d*")
end

-- Alt way to get b/d dates via getContent()
--
-- argsbirth / argsdeath are the raw |birth= / |death= arguments; when one is
-- missing or empty its year is taken from the page's birth/death category.
-- name is the title of the page to read.
-- Returns "<birth> <death>" (each part may be "none"), or "Error" when the
-- page cannot be loaded.
function p.bdDateAlt(argsbirth, argsdeath, name)

  local pagetext = nil

  -- Load the page. mw.title.new() returns nil for an invalid title.
  local t = mw.title.new(name)
  if t and t.exists then
    pagetext = t:getContent()
  end

  if pagetext == nil then
    return "Error"
  end

  -- Remove false positives: HTML comments and <nowiki> sections.
  -- "-" is a pattern quantifier, so the literal dashes must be escaped.
  pagetext = mw.ustring.gsub(pagetext, "<!%-%-.-%-%->", "")
  pagetext = mw.ustring.gsub(pagetext, "<nowiki>.-</nowiki>", "")

  -- "Category:1900 births"
  local birth
  if argsbirth == "" or argsbirth == nil then
    birth = yearFromCategory(pagetext, "births")
  else
    birth = mw.text.trim(argsbirth)
  end

  -- "Category:2000 deaths"
  local death
  if argsdeath == "" or argsdeath == nil then
    death = yearFromCategory(pagetext, "deaths")
  else
    death = mw.text.trim(argsdeath)
  end

  return birth .. " " .. death

end
-- Read a date property from a Wikidata entity and reduce it to its year.
-- The year is the last word of the formatted value when that word starts
-- with a digit, or the word before it when the last word is an era marker
-- (BCE, BC, AD). Returns "none" when no year can be extracted.
local function yearFromWikidata(entity, property)
  local formatted = entity:formatPropertyValues(property)
  local words = {}
  for word in mw.ustring.gmatch(formatted["value"] or "", "%S+") do
    words[#words + 1] = word
  end

  local count = #words
  if count == 0 then
    return "none"
  end

  local lastword = words[count]
  if string.find(lastword, "^%d") then
    return lastword
  end
  -- "BC" also matches "BCE"
  if string.find(lastword, "BC") or string.find(lastword, "AD") then
    -- words[count - 1] is nil when the era marker is the only word
    return words[count - 1] or "none"
  end
  return "none"
end

-- Get b/d dates via Wikidata.
--
-- argsbirth / argsdeath are the raw |birth= / |death= arguments; when one is
-- missing or empty the value comes from Wikidata property P569 / P570.
-- name is only used by the p.bdDateAlt() fallback.
-- Returns "<birth> <death>" (each part may be "none"), or whatever
-- p.bdDateAlt() returns when no Wikidata entity is available.
function p.bdDate(argsbirth, argsdeath, name)

  local entity = mw.wikibase.getEntityObject()
  if not entity or not entity.claims then
    -- Alternative if template not on a page in mainspace. This is needed since Wikidata can only be retrieved
    -- for the article where the template is located.
    return p.bdDateAlt(argsbirth, argsdeath, name)
  end

  -- Note: The below uses formatPropertyValues() to get and format the date from Wikidata.
  --       For an alternative method, see sandbox revision dated 5:58 am, 15 October 2014
  local birth
  if argsbirth == "" or argsbirth == nil then
    birth = yearFromWikidata(entity, 'P569')
  else
    birth = mw.text.trim(argsbirth)
  end

  local death
  if argsdeath == "" or argsdeath == nil then
    death = yearFromWikidata(entity, 'P570')
  else
    death = mw.text.trim(argsdeath)
  end

  -- No fallback to p.bdDateAlt() when Wikidata has neither date: the
  -- "none none" result is returned as-is.
  return birth .. " " .. death

end
--- Percent-encode "&" characters only.
--- Note: this function was added later to deal with "&" characters instead of using p.ia_url_encode since
--- that may break existing instances of the template.
--- A nil or false argument is returned unchanged.
function p.urlX(str)
  if not str then
    return str
  end
  local escaped = mw.ustring.gsub(str, "&", "%%26")
  return escaped
end
--- URL-encode a string
--- http://lua-users.org/wiki/StringRecipes
---
--- Newlines become CR LF, every character other than letters, digits,
--- space, "-", "_", "." and "~" is percent-encoded, and spaces become "+".
--- A nil or false argument is returned unchanged.
function p.ia_url_encode(str)
  if (str) then
    str = mw.ustring.gsub(str, "\n", "\r\n")
    str = mw.ustring.gsub(str, "([^%w %-%_%.%~])",
      function (c)
        -- mw.ustring passes a whole UTF-8 character, which may span several
        -- bytes; every byte needs its own %XX escape.
        return (string.gsub(c, ".", function (byte)
          return string.format("%%%02X", string.byte(byte))
        end))
      end)
    str = mw.ustring.gsub(str, " ", "+")
  end
  return str
end
-- Does str contain extended ascii? 1 = yes, 0 = no.
-- Any byte outside 32..126 counts, and so does the apostrophe (39).
function p.ia_extendedascii(str)
  for pos = 1, str:len() do
    local b = str:byte(pos)
    if b < 32 or b > 126 or b == 39 then -- 39 = "'"
      return 1
    end
  end
  return 0
end
-- UTF-8 aware replacement for string.sub() which doesn't support UTF-8.
-- Note: Using instead of mw.ustring.sub() which I suspect(?) might be cause of intermittent error, and faster here for first-letter job.
-- Source: prapin @ Stack Overflow http://stackoverflow.com/questions/13235091/extract-the-first-letter-of-a-utf-8-string-with-lua
--
-- Returns the first character of str: one lead byte (ASCII or \194-\244)
-- followed by any continuation bytes (\128-\191). Returns nil for "".
function p.firstLetter(str)
  -- No trailing space in the pattern: it would demand a space after the letter.
  return str:match("[%z\1-\127\194-\244][\128-\191]*")
end
-- Replace all extended ascii characters with wildcard '*'
-- Replace "-" with <space> eg. Pierre-Jean -> Pierre Jean
--
-- Works byte by byte: printable ASCII (32..126) is copied through, except
-- that '"' and "'" become "*" and "-" becomes a space. For all other bytes
-- one "*" is emitted per two bytes seen, i.e. one per 2-byte UTF-8 character.
-- NOTE(review): characters encoded in 3 or 4 bytes do not map to exactly one
-- "*" each; the pairing state also carries over across ASCII characters.
function p.ia_extendedascii2wildcard(str)
  local out = {}
  local pending = false -- true after the first byte of a non-ASCII pair

  for i = 1, str:len() do
    local k = str:byte(i)
    if k >= 32 and k <= 126 then
      -- For list of Lucene special characters needing to be escaped:
      --   http://lucene.apache.org/core/4_10_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#Escaping_Special_Characters
      -- We only worry about - (45) and " (34) since the others are unlikely to appear in a proper name.
      -- Also ' (39) since it is sometimes the extended character ’
      if k == 34 or k == 39 then
        out[#out + 1] = "*"
      elseif k == 45 then
        out[#out + 1] = " "
      else
        out[#out + 1] = str:sub(i, i)
      end
    elseif pending then
      out[#out + 1] = "*"
      pending = false
    else
      pending = true
    end
  end

  return table.concat(out)
end
-- Replace accented letters with non-accented equivalent letters
--  Note: this is not a complete list of all possible accented letters. It is
--        all of the accented letters found in the first 10,000 names using
--        the Internet Archive author template.
--
-- The substitutions are applied one after another in the order listed, so
-- the order of the pairs is significant.
-- NOTE(review): a few targets look unusual (e.g. "ł" -> "i", "ĺ" -> "I",
-- "Ø" -> "o"); they are kept because changing them changes the search URLs
-- produced for existing uses of the template.
function p.ia_deaccent(str)
  local substitutions = {
    { "á", "a" }, { "a︡", "a" }, { "Á", "A" }, { "ă", "a" },
    { "â", "a" }, { "æ", "ae" }, { "Æ", "AE" }, { "à", "a" },
    { "ā", "a" }, { "Ā", "A" }, { "ą", "a" }, { "å", "a" },
    { "Å", "A" }, { "ã", "a" }, { "ä", "a" }, { "Ä", "A" },
    { "β", "B" },
    { "ć", "c" }, { "č", "c" }, { "Č", "C" }, { "ç", "c" },
    { "Ç", "C" }, { "ĉ", "c" },
    { "ď", "d" }, { "đ", "d" },
    { "é", "e" }, { "É", "E" }, { "ě", "e" }, { "ê", "e" },
    { "è", "e" }, { "È", "E" }, { "ε", "e" }, { "ē", "e" },
    { "Ē", "E" }, { "ę", "e" }, { "ð", "e" }, { "ë", "e" },
    { "Ë", "E" },
    { "γ", "Y" },
    { "ħ", "h" },
    { "i︠a︡", "ia" }, { "í", "i" }, { "i︠", "i" }, { "ĭ", "i" },
    { "Í", "I" }, { "î", "i" }, { "Î", "I" }, { "ì", "i" },
    { "ī", "i" }, { "ł", "i" }, { "ï", "i" }, { "Ï", "I" },
    { "ĺ", "I" }, { "Ĺ", "L" },
    { "μ", "u" }, { "µ", "u" },
    { "ń", "n" }, { "ň", "n" }, { "ņ", "n" }, { "ñ", "n" },
    { "Ñ", "N" },
    { "ó", "o" }, { "Ó", "O" }, { "ô", "o" }, { "œ", "oe" },
    { "ò", "o" }, { "ō", "o" }, { "ø", "o" }, { "Ø", "o" },
    { "õ", "o" }, { "ö", "o" }, { "ő", "o" }, { "Ö", "O" },
    { "φ", "o" },
    { "ŕ", "r" }, { "ř", "r" }, { "Ř", "R" },
    { "ß", "ss" }, { "ś", "s" }, { "Ś", "S" }, { "š", "s" },
    { "ṣ", "s" }, { "Š", "S" }, { "ş", "s" }, { "Ş", "S" },
    { "ŝ", "s" }, { "σ", "s" },
    { "ť", "t" }, { "ţ", "t" }, { "τ", "t" },
    { "þ", "p" }, { "Þ", "p" },
    { "ú", "u" }, { "Ú", "U" }, { "û", "u" }, { "ù", "u" },
    { "ū", "u" }, { "ů", "u" }, { "ü", "u" }, { "Ü", "U" },
    { "ŵ", "w" },
    { "ý", "y" }, { "ŷ", "y" }, { "¥", "y" }, { "ÿ", "y" },
    { "Ÿ", "Y" },
    { "ź", "z" }, { "Ž", "Z" }, { "ž", "z" }, { "ż", "z" },
    { "Ż", "Z" },
  }

  local result = str
  for _, pair in ipairs(substitutions) do
    -- single-variable assignment keeps only gsub's first result
    result = mw.ustring.gsub(result, pair[1], pair[2])
  end
  return result
end
-- Export the module table.
return p