User:JVbot/periodicalbot.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script creates Wikidata claims from enwp:Template:Infobox journal and Infobox magazine.
Pages to work on can be specified using any of:
&params;
"""
#
# (C) John Vandenberg, 2014
#
# Distributed under the terms of the MIT license.
#
import json
import time  # for sleep() between retries
import stdnum.issn as stdnum_issn
import pywikibot
from pywikibot import pagegenerators
docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
}
def getInfobox(templates,infobox_type = None):
    for (inPageTemplate, params) in templates:
        template_title = inPageTemplate.title().lower()
        if ((not infobox_type and ':infobox' in template_title)
                or (infobox_type
                    and ((':infobox ' + infobox_type in template_title)
                         or (':info/' + infobox_type in template_title)
                         or (':ficha de ' + infobox_type in template_title)
                         or (':' + infobox_type in template_title)))):
            # Work out the type from the template name so the caller can
            # verify that it matches the requested type.
            page_infobox_type = infobox_type or ''
            if ':infobox ' in template_title:
                page_infobox_type = template_title.split(':')[1][len('infobox '):]
            elif ':ficha de ' in template_title:  # es
                page_infobox_type = template_title.split(':')[1][len('ficha de '):]
            elif ':info/' in template_title:  # pt
                page_infobox_type = template_title.split(':')[1][len('info/'):]
            elif infobox_type and ':' + infobox_type in template_title:  # other
                page_infobox_type = template_title.split(':')[1]
            params.append('infobox_type=' + page_infobox_type)
            return params
    # No matching infobox was found; list the page's templates to aid debugging
    print 'Templates:'
seen = []
for (inPageTemplate, params) in templates:
if inPageTemplate.title() not in seen:
print inPageTemplate.title()
seen.append(inPageTemplate.title())
def getInfoboxField(infobox, field):
    """Return the value of the first infobox parameter named field (case-insensitive)."""
for param in infobox:
if param.lower().startswith(field.lower()+'='):
return param[param.find('=')+1:]
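# For example (illustrative values):
#   getInfoboxField(['title=Nature', 'ISSN=0028-0836'], 'issn') -> '0028-0836'
#   getInfoboxField(['title=Nature'], 'issn') -> None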
def loadSources(family=None, language_codes=None, repo=None):
    """
    Fetch the source items from the on-wiki list and return them.
    """
if not repo:
repo = pywikibot.Site('wikidata','wikidata')
print 'Fetching wiki site items'
page = pywikibot.Page(repo, u'Wikidata:List of wikis/python')
# TODO: cache page
source_values = json.loads(page.get())
if family:
source_values = source_values[family]
#for source_lang in source_values:
# if not language_codes or source_lang in language_codes:
# source_values[source_lang] = pywikibot.ItemPage(repo, source_values[source_lang])
    # TODO: if all requested languages are not in the array, raise an exception
#raise Exception('Unsupported source language %s' % lang)
return source_values
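# Expected (assumed) shape of the JSON on Wikidata:List of wikis/python, such
# that loadSources('wikipedia')['en'] == 'Q328':
#   {"wikipedia": {"en": "Q328", "fr": "Q8447", ...}, ...}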
def loadClaims(filename, property_id):
    """Load claims for property_id from a tab-separated dump into {value: [Q-ids]} form."""
    claims = {}
f = open(filename, 'rb')
property_id = str(property_id)
for line in f:
#print line
q, p, v = line.strip().split("\t")
#print "%s, %s, %s" % (q, p, v)
if p.strip() == property_id:
v = v.strip()[1:]
q = 'Q'+q.strip()
if v not in claims:
claims[v] = []
claims[v].append(q)
#print 'Added ' + q
f.close()
return claims
def loadISSNs(filename):
    # The file format is the same as that produced by parse_xml_bz2.php,
    # which comes from https://bitbucket.org/magnusmanske/wikidataquery
    # A suitable file can be obtained via: cat dump.xml | parse_xml_bz2.php | grep '[[:space:]]236[[:space:]]'
    return loadClaims(filename, 236)  # P236 is the ISSN property
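# Illustrative dump line (tab-separated: item id, property id, value; the value
# carries a one-character type prefix, assumed here to be 'S' for strings,
# which loadClaims() strips):
#   1002900	236	S0028-0836
# yielding claims['0028-0836'] == ['Q1002900']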
def main():
args = pywikibot.handleArgs()
to_q = None
testsitename = False
verbose = False
force = False
infobox_type = 'journal'
gen_args = []
genFactory = pagegenerators.GeneratorFactory()
for arg in args:
if genFactory.handleArg(arg):
gen_args.append(arg)
continue
elif arg.startswith('-test:'):
testsitename = arg[len('-test:'):]
elif arg.startswith('-f'):
force = True
        elif arg.startswith('-type:'):
            infobox_type = arg[len('-type:'):]
if infobox_type.lower().replace('_',' ') == u'presse ecrite':
infobox_type = u'presse écrite'
        elif arg.startswith('-q:'):
            to_q = arg[len('-q:'):]
force = True
else:
raise Exception('Unknown argument')
site = pywikibot.getSite()
if testsitename:
(test_lang,test_family) = testsitename.split('.')
datasite = pywikibot.getSite(test_lang,test_family)
if datasite:
print "Using %s" % datasite.sitename()
else:
raise Exception('Failed to get a test site to play with')
datasite = datasite.data_repository()
else:
datasite = site.data_repository()
gen = genFactory.getCombinedGenerator()
if not gen:
raise Exception('Target pages not specified')
gen_args = ' '.join(gen_args)
    if infobox_type in ('journal', 'revue', u'revista científica', 'akademisk tidskrift'):
        instance_of_qid = '5633421'  # Q5633421: scientific journal
    elif infobox_type == 'magazine':
        instance_of_qid = '41298'  # Q41298: magazine
    elif infobox_type == 'newspaper':
        instance_of_qid = '11032'  # Q11032: newspaper
    elif infobox_type in ('publikation', 'tijdschrift', 'revista', u'presse écrite'):
        instance_of_qid = '1092563'  # Q1092563: periodical literature
    else:
        raise Exception('Unknown type %s' % infobox_type)
lang = site.language()
enwp_qid = source_wp_qid = '328' # English Wikipedia
if lang != 'en':
sources = loadSources('wikipedia',language_codes=[lang],repo=datasite)
source_wp_qid = sources[lang][1:]
issns = loadISSNs('dumps/issn_claims.tab')
print "loaded %d issns" % len(issns)
for page in gen:
#pywikibot.output(u"Initialising %s ..." % page.title() )
item = None
wp_qid = source_wp_qid
infobox_type_req = None #get the first
if force:
infobox_type_req = infobox_type # find the right one
if to_q:
if to_q != '-1':
item = pywikibot.ItemPage(datasite,'Q'+to_q)
else:
item = pywikibot.ItemPage.fromPage(page)
if item:
if not item.exists():
pywikibot.output(u"%s does not already exist in Wikidata." % page.title() )
item = None
else:
try:
item.get()
                except Exception:
                    pywikibot.output(u"Failed loading %s item %s; skipping." % (page.title(), item.title()))
continue
if item:
            # P357: title, P31: instance of, P236: ISSN
            if 'P357' in item.claims and 'P31' in item.claims and 'P236' in item.claims:
if verbose:
pywikibot.output(u"%s already has the necessary claims..." % page.title() )
continue
if lang == 'zh' and not force:
if item.sitelinks:
if 'enwiki' not in item.sitelinks:
other_langs = item.sitelinks.keys()
other_langs.remove(lang+'wiki')
                    if len(other_langs) == 0:
                        print u"%s doesn't exist on any other Wikipedia..." % page.title()
                    elif len(other_langs) == 1:
                        print u"%s also exists on %s.wikipedia..." % (page.title(), other_langs[0][:2])
                    else:
                        print u"%s doesn't exist on enwiki; it does exist on: %s" % (page.title(), u','.join(other_langs))
continue
else:
print u"Loading enwiki %s for %s ..." % (item.sitelinks['enwiki'], page.title() )
page = pywikibot.Page( pywikibot.Site('en','wikipedia'), item.sitelinks['enwiki'])
wp_qid = enwp_qid
infobox_type_req = None
# TODO: also change all uses of 'lang' to en
else:
                raise Exception("Items without any sitelinks can't be processed yet")
        try:
            infobox = getInfobox(page.templatesWithParams(), infobox_type_req)
        except Exception:
            pywikibot.output(u"Failed to load %s. Sleeping ..." % page.title())
            time.sleep(3)
            try:
                infobox = getInfobox(page.templatesWithParams(), infobox_type_req)
            except Exception:
                pywikibot.output(u"Failed to load %s again. Sleeping & skipping ..." % page.title())
                time.sleep(3)
                continue
        if not infobox:
            print "Page %s doesn't have an infobox; skipping" % page.title()
            continue
        page_infobox_type = getInfoboxField(infobox, 'infobox_type')
        if page_infobox_type != infobox_type:
            print 'The first infobox on page %s is of type %s instead of %s; skipping' % (page.title(), page_infobox_type, infobox_type)
            continue
title = None
subtitle = None
        if lang in ('en', 'zh'):
            title = getInfoboxField(infobox, 'title')
            if not title:
                title = getInfoboxField(infobox, 'name')
        elif lang in ('de', 'nl', 'sv'):
            title = getInfoboxField(infobox, 'titel')
        elif lang == 'fr':
            title = getInfoboxField(infobox, 'titre')
            if not title:
                title = getInfoboxField(infobox, 'nom')  # infobox Presse écrite
        elif lang in ('pt', 'es'):
            title = getInfoboxField(infobox, u'título')
        else:
            raise Exception('Unsupported title language %s' % lang)
issn = getInfoboxField(infobox, 'ISSN')
eissn = getInfoboxField(infobox, 'eISSN')
if lang == 'fr' and not eissn and infobox_type == u'presse écrite':
eissn = getInfoboxField(infobox, u'ISSN électronique')
if eissn == issn:
eissn = None
if getInfoboxField(infobox, 'ISSN2'):
            print 'Multiple ISSNs indicate the work has many parts; skipping'
continue
try:
if issn:
if len(issn.strip()) > 9:
print 'trimming %s' % issn
issn = issn.strip()[0:9]
#issn = issn.replace(u' ','-')
if not stdnum_issn.is_valid(issn):
print 'Page %s has invalid ISSN: %s' % (page.title(), issn)
issn = None
elif stdnum_issn.format(issn) != issn:
                    print 'Page %s ISSN %s reformatted to %s' % (page.title(), issn, stdnum_issn.format(issn))
issn = stdnum_issn.format(issn)
if eissn:
if not stdnum_issn.is_valid(eissn):
print 'Page %s has invalid eISSN: %s' % (page.title(), eissn)
eissn = None
elif stdnum_issn.format(eissn) != eissn:
                    print 'Page %s eISSN %s reformatted to %s' % (page.title(), eissn, stdnum_issn.format(eissn))
eissn = stdnum_issn.format(eissn)
        except Exception:
            print 'Failure on page %s during ISSN checking for %s and %s' % (page.title(), issn, eissn)
            continue
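        # For reference: stdnum.issn.is_valid('0028-0836') -> True, and
        # stdnum.issn.format('00280836') -> '0028-0836' (hyphen restored).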
if issn:
if issn in issns:
issn_qs = issns[issn]
                print 'Page %s has an ISSN of %s, which is currently registered to %s' % (page.title(), issn, ','.join(issn_qs))
if not item or item.title() not in issn_qs:
print '... which is not linked to the page'
continue
else:
print 'Page %s ISSN %s is not in Wikidata' % (page.title(), issn)
if eissn:
if eissn in issns:
issn_qs = issns[eissn]
                print 'Page %s has an eISSN of %s, which is currently registered to %s' % (page.title(), eissn, ','.join(issn_qs))
if not item or item.title() not in issn_qs:
print '... which is not linked to the page'
continue
else:
                print 'Page %s eISSN %s is not in Wikidata' % (page.title(), eissn)
        if item and 'P357' in item.claims and 'P31' in item.claims and ('P236' in item.claims or (not issn and not eissn)):
            print "Page %s doesn't have any metadata to be added" % page.title()
            continue
if not title:
            if infobox_type in page.title().lower():
                print "Page %s doesn't have a 'title' param, but the page title confirms the type" % page.title()
            elif issn or eissn:
                print "Page %s doesn't have a 'title' param" % page.title()
                print '...defaulting to page title; double check this'
            else:
                print "Page %s doesn't have a 'title' param and can't be verified; skipping" % page.title()
                continue
title = page.title()
if title.endswith(' ('+infobox_type+')'):
title = title.split(' ('+infobox_type+')')[0]
if '<br' in title: # <br>, <br/>, etc
print "trimming title %s" % title
title = title[0:title.find('<br')]
        # {{no wrap}} is occasionally used for long titles on enwp
        if lang == 'en':
            title = title.replace('{{no wrap|', '').replace('{{nowrap|', '').replace('}}', '').replace("''", '').replace('&#124;', '|')  # decode HTML-escaped pipes
        # frwp wraps titles in {{lang|xx|...}}
        elif lang == 'fr':
            # keep the first language only
            title = title.split('}}')[0].replace('{{Lang|', '{{lang|')
            for title_lang in ('en', 'de', 'fr', 'la', 'es', 'el', 'it', 'pt', 'cr'):
                title = title.replace('{{lang|' + title_lang + '|', '')
            title = title.replace('texte=', '').replace("''", '')
        if ': ' in title:
            subtitle = title[title.find(': ') + 2:].strip()
            title = title.split(': ')[0].strip()  # strip to fix 'abc : def'
        if ' - ' in title:
            (title, subtitle) = title.split(' - ', 1)
            subtitle = subtitle.strip()
        # Sometimes the periodical infobox is on an article about the organisation;
        # detect this, or other title mismatches
        if not force and (title.replace('The ', '').replace('La ', '').lower()
                          .replace('.', '').replace(' and ', ' & ').replace(u'’', "'")
                          not in page.title().lower()
                          .replace('.', '').replace(' and ', ' & ').replace(u'’', "'")):
            if item and ('en' not in item.labels or (title not in item.labels['en'] and title not in item.labels.get(lang, ''))) and ('en' not in item.aliases or title not in item.aliases['en']):
                print "Infobox title %s is not in the page title %s, or in the item label or aliases; possibly an organisation with a periodical, or a periodical series" % (title, page.title())
                continue
# Special cases to avoid
if lang != 'en' and title == 'Proceedings of the Royal Society':
continue
if lang == 'zh' and not force:
# By default, this bot expects a human to review the above output
# and check for duplicates before forcing the bot to proceed
continue
        if not item:
            if to_q == '-1':
                if not issn and not eissn:
                    pywikibot.output(u"Skipping creation of item from %s as it doesn't have any unique id (ISSN, etc)" % page.title())
                    continue
                pywikibot.output(u"Creating item from %s" % page.title())
            else:
                pywikibot.output(u"Skipping creation of item from %s; add -q:-1 to create" % page.title())
                continue
else:
pywikibot.output(u"Adding claims from page %s to %s" % (page.title(),item.title()) )
# TODO: default to putting labels in the language of the source item
# put 'Back To The Roots' in labels[en]
#if item and ('en' not in item.labels or (title not in item.labels['en'] and title not in item.labels[lang]))
        if not item:
            data = {'labels': {'en': {'language': 'en', 'value': title}},
                    'descriptions': {'en': {'language': 'en', 'value': infobox_type}},
                    'sitelinks': {lang + 'wiki': {'site': lang + 'wiki', 'title': page.title()}},
                    'claims': []
                    }
        else:
            if 'P357' in item.claims:
                print 'Item %s already has a "title" claim' % item.title()
            if 'P31' in item.claims:
                print 'Item %s already has "instance of" claims' % item.title()
            if 'P236' in item.claims:
                print 'Item %s already has ISSN claims' % item.title()
            elif not issn and not eissn:
                print "Page %s doesn't have any 'ISSN' params" % page.title()
            data = {'claims': []}
        # Reference snak: P143 ('imported from') pointing at the item for the
        # source Wikipedia (e.g. Q328 for the English Wikipedia)
        source_snak = {"snaks": {"P143": [
            {"snaktype": "value", "property": "P143",
             "datavalue": {"value": {"entity-type": "item", "numeric-id": int(wp_qid)}, "type": "wikibase-entityid"}
             }
        ]}}
        if not item or 'P357' not in item.claims:  # title
            data['claims'].append({"mainsnak": {"snaktype": "value", "property": "P357", "datavalue": {"value": title, "type": "string"}}, "type": "statement", "rank": "normal", "references": [source_snak]})
        if subtitle and (not item or 'P392' not in item.claims):  # subtitle
            data['claims'].append({"mainsnak": {"snaktype": "value", "property": "P392", "datavalue": {"value": subtitle, "type": "string"}}, "type": "statement", "rank": "normal", "references": [source_snak]})
        if not item or 'P31' not in item.claims:  # instance of
            data['claims'].append({"mainsnak": {"snaktype": "value", "property": "P31", "datavalue": {"value": {"entity-type": "item", "numeric-id": int(instance_of_qid)}, "type": "wikibase-entityid"}}, "type": "statement", "rank": "normal", "references": [source_snak]})
        if issn and (not item or 'P236' not in item.claims):  # ISSN
            data['claims'].append({"mainsnak": {"snaktype": "value", "property": "P236", "datavalue": {"value": issn, "type": "string"}}, "type": "statement", "rank": "normal", "references": [source_snak]})
        if eissn and (not item or 'P236' not in item.claims):  # eISSN (also P236)
            data['claims'].append({"mainsnak": {"snaktype": "value", "property": "P236", "datavalue": {"value": eissn, "type": "string"}}, "type": "statement", "rank": "normal", "references": [source_snak]})
if not item:
item = pywikibot.ItemPage(datasite)
#print data
        try:
            item.editEntity(data)
        except Exception:
            pywikibot.output(u"Failed to save data for %s. Sleeping ..." % page.title())
            time.sleep(3)
            try:
                item.editEntity(data)
            except Exception:
                pywikibot.output(u"Failed to save data for %s again. Sleeping & skipping ..." % page.title())
                time.sleep(3)
                continue
if __name__ == "__main__":
try:
main()
finally:
pywikibot.stopme()