User:Sz-iwbot/hanzi.py
外观
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup # For processing HTML
import sys, os, chardet
import wikipedia
import urllib2,time,random, urllib
# Work around an encoding problem with Korean text by forcing the
# process-wide default string encoding to UTF-8.  reload(sys) comes first
# because Python 2 removes sys.setdefaultencoding from the module at startup.
reload(sys)
sys.setdefaultencoding('utf-8')
# Site the generated entries are written to (getSite() with no arguments;
# presumably the default site from the bot's configuration -- confirm).
site = wikipedia.getSite()
# Wikipedia editions used further down to check whether an article titled
# with the character exists, which decides if a {{wikipedia}} box is emitted
# in the Chinese, Japanese, Korean and Vietnamese sections respectively.
sitezh = wikipedia.getSite(code='zh',fam='wikipedia')
siteja = wikipedia.getSite(code='ja',fam='wikipedia')
siteko = wikipedia.getSite(code='ko',fam='wikipedia')
sitevi = wikipedia.getSite(code='vi',fam='wikipedia')
def remove_dups(lst):
    """Return the distinct elements of *lst* as a new list.

    The first occurrence of every element is kept and the relative order of
    those first occurrences is preserved, so the result is deterministic
    (the previous dict-based version returned elements in arbitrary order).

    Limitation: elements must be hashable, so this does not work with lists,
    dicts etc. as list elements.
    """
    seen = set()
    unique = []
    for item in lst:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
def GetPage(codepoint):
    """Fetch the Unihan database page for one code point.

    Args:
        codepoint: the code point as a hexadecimal string without prefix
            (the caller passes e.g. '4e00'); it is substituted into the
            query string unchanged.

    Returns:
        The response body exactly as returned by ``read()`` (not decoded).

    Raises:
        Whatever ``urllib2.urlopen`` or ``read`` raise; nothing is caught.
    """
    baseurl = u'http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint=%s'
    response = urllib2.urlopen(baseurl % codepoint)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails; the bot issues
        # one request per character over a very long run.
        response.close()
# ---------------------------------------------------------------------------
# Main script: walk the CJK Unified Ideographs block, scrape the Unihan page
# of every character, build a wiki entry from it and save it.
#
# NOTE(review): the original listing had lost all indentation; the nesting
# below was reconstructed from the logic.  Places where more than one nesting
# was plausible are marked with NOTE(review).
# ---------------------------------------------------------------------------

PROGRESS_FILE = 'unihan.dat'    # holds the next code point to process
CEDICT_FILE = 'cedict_ts.u8'    # word list used for the compound-word section

# Radical glyphs indexed by (radical number - 1); used for {{部首|...}}.
# NOTE(review): entry 55 (廿) and entry 146 (西) differ from the usual Kangxi
# radical glyphs (廾 and 襾).  They are kept exactly as in the original list;
# confirm against what the {{部首}} template expects before changing them.
RADICALS = [
    u'一', u'丨', u'丶', u'丿', u'乙', u'亅', u'二', u'亠', u'人', u'儿',
    u'入', u'八', u'冂', u'冖', u'冫', u'几', u'凵', u'刀', u'力', u'勹',
    u'匕', u'匚', u'匸', u'十', u'卜', u'卩', u'厂', u'厶', u'又', u'口',
    u'囗', u'土', u'士', u'夂', u'夊', u'夕', u'大', u'女', u'子', u'宀',
    u'寸', u'小', u'尢', u'尸', u'屮', u'山', u'巛', u'工', u'己', u'巾',
    u'干', u'幺', u'广', u'廴', u'廿', u'弋', u'弓', u'彐', u'彡', u'彳',
    u'心', u'戈', u'戶', u'手', u'支', u'攴', u'文', u'斗', u'斤', u'方',
    u'无', u'日', u'曰', u'月', u'木', u'欠', u'止', u'歹', u'殳', u'毋',
    u'比', u'毛', u'氏', u'气', u'水', u'火', u'爪', u'父', u'爻', u'爿',
    u'片', u'牙', u'牛', u'犬', u'玄', u'玉', u'瓜', u'瓦', u'甘', u'生',
    u'用', u'田', u'疋', u'疒', u'癶', u'白', u'皮', u'皿', u'目', u'矛',
    u'矢', u'石', u'示', u'禸', u'禾', u'穴', u'立', u'竹', u'米', u'糸',
    u'缶', u'网', u'羊', u'羽', u'老', u'而', u'耒', u'耳', u'聿', u'肉',
    u'臣', u'自', u'至', u'臼', u'舌', u'舛', u'舟', u'艮', u'色', u'艸',
    u'虍', u'虫', u'血', u'行', u'衣', u'西', u'見', u'角', u'言', u'谷',
    u'豆', u'豕', u'豸', u'貝', u'赤', u'走', u'足', u'身', u'車', u'辛',
    u'辰', u'辵', u'邑', u'酉', u'釆', u'里', u'金', u'長', u'門', u'阜',
    u'隶', u'隹', u'雨', u'靑', u'非', u'面', u'革', u'韋', u'韭', u'音',
    u'頁', u'風', u'飛', u'食', u'首', u'香', u'馬', u'骨', u'高', u'髟',
    u'鬥', u'鬯', u'鬲', u'鬼', u'魚', u'鳥', u'鹵', u'鹿', u'麥', u'麻',
    u'黃', u'黍', u'黑', u'黹', u'黽', u'鼎', u'鼓', u'鼠', u'鼻', u'齊',
    u'齒', u'龍', u'龜', u'龠',
]

# Layout of the scraped page: (section heading, tag that holds each value,
# field names looked up inside that section's table).  Every field of the
# original script is still extracted, including those not used in the entry.
UNIHAN_SECTIONS = (
    ('IRG Sources', 'code', (
        'kIICore',        # International Ideographs Core
        'kIRG_GSource',   # GB (national standard) source? GB 2312 etc. (hex?)
        'kIRG_HSource',   # Hong Kong Supplementary Character Set (hex?)
        'kIRG_JSource',   # the various Japanese sources (hex?)
        'kIRG_KPSource',  # North Korea
        'kIRG_KSource',   # South Korea
        'kIRG_TSource',   # Taiwan
        'kIRG_USource',   # see http://unicode.org/reports/tr45/#UAX38
        'kIRG_VSource',   # Vietnam
        'kIRG_MSource',   # Macao Information System Character Set
    )),
    ('Dictionary Indices', 'code', (
        'kCheungBauerIndex',   # Cheung/Bauer, Cantonese written in characters
        'kCowles',             # pocket dictionary of Cantonese
        'kDaeJaweon',          # Dae Jaweon (Korean) dictionary
        'kFennIndex',          # Fenn's Chinese-English Pocket Dictionary
        'kGSR',                # Grammata Serica Recensa (Han Wen Dian)
        'kHanYu',              # Hanyu Da Zidian
        'kIRGDaeJaweon',       # Dae Jaweon
        'kIRGDaiKanwaZiten',   # Dai Kan-Wa Jiten
        'kIRGHanyuDaZidian',   # Hanyu Da Zidian
        'kIRGKangXi',          # Kangxi Dictionary
        'kKangXi',             # Kangxi Dictionary
        'kKarlgren',           # analytic dictionary of Chinese/Sino-Japanese
        'kLau',                # A Practical Cantonese-English Dictionary
        'kMatthews',           # Mathews' Chinese-English Dictionary
        'kMeyerWempe',         # Student's Cantonese-English Dictionary
        'kMorohashi',          # Dai Kan-Wa Jiten
        'kNelson',             # Nelson's Japanese-English character dictionary
        'kSBGY',               # Song Ben Guang Yun
    )),
    ('Dictionary-like Data', 'code', (
        'kCangjie',         # Cangjie input method
        'kCheungBauer',     # Cheung/Bauer, Cantonese written in characters
        'kCihaiT',          # Cihai
        'kFenn',            # Fenn ("Five Thousand Dictionary")
        'kFourCornerCode',  # four-corner code
        'kFrequency',       # frequency in Traditional Chinese USENET posts
        'kGradeLevel',      # Longman elementary Chinese dictionary
        'kHDZRadBreak',     # Hanyu Da Zidian
        'kHKGlyph',         # HK list of graphemes of common characters (2000)
        'kPhonetic',        # "Ten Thousand Characters: An Analytic Dictionary"
        'kTotalStrokes',    # stroke count
    )),
    # Numeric value; only one of the three is expected to be present.
    ('Numeric Values', 'code', (
        'kAccountingNumeric',  # capital / accounting numerals
        'kOtherNumeric',       # special numerals
        'kPrimaryNumeric',     # the corresponding numeral
    )),
    ('Other Mappings', 'code', (
        'kBigFive',            # Big5
        'kCCCII',              # Chinese Character Code for Info. Interchange
        'kCNS1986',            # Taiwan CNS 11643-1986
        'kCNS1992',            # Taiwan CNS 11643-1992
        'kEACC',               # East Asian Character Code
        'kGB0',                # GB 2312-80
        'kGB1',                # GB 12345-90
        'kGB3',                # GB 7589-87
        'kGB5',                # GB 7590-87
        'kGB7',                # GB 8565-89
        'kGB8',                # GB 8565-89
        'kHKSCS',              # Hong Kong Supplementary Character Set
        'kIBMJapan',           # IBM Japanese encoding
        'kJis0',               # JIS X 0208-1990
        'kJis1',               # JIS X 0212-1990
        'kJIS0213',            # JIS X 0213-2000
        'kKPS0',               # KPS 9566-97
        'kKPS1',               # KPS 10721-2000
        'kKSC0',               # KS X 1001:1992 (KS C 5601-1989)
        'kKSC1',               # KS X 1002:1991 (KS C 5657-1991)
        'kMainlandTelegraph',  # PRC telegraph code
        'kPseudoGB1',          # pseudo GB 12345-90
        'kTaiwanTelegraph',    # Taiwan telegraph code
        'kXerox',              # Xerox code
    )),
    ('Radical-stroke Indices', 'code', (
        'kRSAdobe_Japan1_6',  # Adobe-Japan1-6
        'kRSJapanese',        # Japanese radical
        'kRSKangXi',          # Kangxi Dictionary
        'kRSKanWa',           # Dai Kan-Wa Jiten
        'kRSKorean',          # Korean radical
        'kRSUnicode',         # Unicode radical
    )),
    ('Readings', 'code', (
        'kCantonese',    # Jyutping (LSHK Cantonese romanisation)
        'kDefinition',   # English translation
        'kHangul',       # Hangul
        'kHanyuPinlu',   # Hanyu pinyin with frequency
        'kHanyuPinyin',  # Hanyu pinyin
        'kJapaneseKun',  # Japanese kun reading
        'kJapaneseOn',   # Japanese on reading
        'kKorean',       # Korean pronunciation (Yale system)
        'kMandarin',     # Putonghua
        'kTang',         # Tang-dynasty pronunciation (not the "to-on")
        'kVietnamese',   # Vietnamese
        'kXHC1983',      # Xiandai Hanyu Cidian
    )),
    # Variant values are taken from the link (<a>) following the field name.
    ('Variants', 'a', (
        'kCompatibilityVariant',        # from UnicodeData.txt
        'kSemanticVariant',             # variant character?
        'kSimplifiedVariant',           # simplified form
        'kSpecializedSemanticVariant',  # variant
        'kTraditionalVariant',          # traditional form
        'kZVariant',                    # variant
    )),
)

# Encoding lines of the entry, in output order: (field name, line prefix).
ENCODING_LABELS = (
    ('kBigFive', u'* [[w:Big 5|]]:'),
    ('kCCCII', u'* [[w:CCCII|]]:'),
    ('kCNS1986', u'* [[w:中文標準交換碼|CNS 11643-1986]]:'),
    ('kCNS1992', u'* [[w:中文標準交換碼|CNS 11643-1992]]:'),
    ('kEACC', u'* EACC:'),
    ('kGB0', u'* [[w:GB 2312|GB 2312-80]]:'),
    ('kGB1', u'* [[w:GB 12345|GB 12345-90]]:'),
    ('kGB3', u'* [[w:GB 7589|GB 7589-87]]:'),
    ('kGB5', u'* [[w:GB 7590|GB 7590-87]]:'),
    ('kGB7', u'* [[w:GB 8565|GB 8565-89]]:'),
    ('kGB8', u'* [[w:GB 8565|GB 8565-89]]:'),
    ('kHKSCS', u'* [[w:香港增補字符集|HKSCS]]:'),
    ('kJis0', u'* [[w:JIS X 0208|JIS X 0208-1990]]:'),
    ('kJis1', u'* [[w:JIS X 0212|JIS X 0212-1990]]:'),
    ('kJIS0213', u'* [[w:JIS X 0213|JIS X 0213-2000]]:'),
    ('kKPS0', u'* [[w:KPS 9566|KPS 9566-97]]:'),
    ('kKPS1', u'* [[w:KPS 10721|KPS 10721-2000]]:'),
    ('kKSC0', u'* [[w:KS X 1001|KS X 1001:1992]]:'),
    ('kKSC1', u'* [[w:KS X 1002|KS X 1002:1991]]:'),
    ('kPseudoGB1', u'* [[w:GB 12345|伪GB码]]:'),
)


def _load_start_codepoint(path, default):
    """Return the code point to resume from, or *default* if none is stored."""
    try:
        state = open(path)
    except IOError:
        return default
    try:
        return int(state.read())
    except (IOError, ValueError):
        # Unreadable or corrupt checkpoint: start from the beginning.
        return default
    finally:
        state.close()


def _save_progress(path, next_codepoint):
    """Store the code point the next run should start with."""
    state = open(path, 'w')
    try:
        state.write(str(next_codepoint))
    finally:
        state.close()


def _load_cedict(path):
    """Read the word list once and return (first, second) headword pairs.

    Each line is split on single spaces; only lines whose first field is
    longer than one character are kept, as in the per-character scan of the
    original script.  Lines without a second field are skipped.
    """
    entries = []
    source = open(path)
    try:
        for raw in source:
            parts = raw.decode('utf-8').split(u' ', 2)
            if len(parts) >= 2 and len(parts[0]) > 1 and parts[1]:
                entries.append((parts[0], parts[1]))
    finally:
        source.close()
    return entries


def _find_words(hanzi, cedict):
    """Split dictionary words into those starting and those ending with *hanzi*.

    A word whose either headword starts with the character goes to the first
    list; otherwise, if either headword ends with it, to the second list.
    """
    starting = []
    ending = []
    for first, second in cedict:
        if first[0] == hanzi or second[0] == hanzi:
            starting.append((first, second))
        elif first[-1] == hanzi or second[-1] == hanzi:
            ending.append((first, second))
    return starting, ending


def _section_table(blockquote, heading):
    """Return the table following the text node *heading*, or None."""
    try:
        return blockquote.find(text=heading).findNext('table')
    except AttributeError:
        # find() returned None: the page has no such section.
        return None


def _field_value(table, name, tag):
    """Return the string of the first *tag* after the text node *name*.

    Returns '' when the section table or the field is missing.  Only
    AttributeError (a lookup step returned None) is treated as "missing",
    so Ctrl-C and genuine errors are no longer swallowed.
    """
    try:
        return table.find(text=name).findNext(tag).string
    except AttributeError:
        return ''


def _parse_fields(blockquote):
    """Extract every field listed in UNIHAN_SECTIONS into a dict."""
    fields = {}
    for heading, tag, names in UNIHAN_SECTIONS:
        table = _section_table(blockquote, heading)
        for name in names:
            fields[name] = _field_value(table, name, tag)
    return fields


def _link(text):
    """Wrap *text* in wiki-link brackets."""
    return u'[[' + wikipedia.html2unicode(text) + u']]'


def _num(text):
    """Re-render a decimal string through int() (drops leading zeros)."""
    return wikipedia.html2unicode(str(int(text)))


def _split_definition(definition):
    """Turn the kDefinition text into (general glosses, Japanese glosses).

    The text is split on '; ' into senses and each sense on ', ' into
    glosses.  Senses containing '(Cant.)' are prefixed with the Cantonese
    marker and added to the general list; senses containing '(J)' go to the
    Japanese list.
    """
    if not definition:
        return '', ''
    general = []
    japanese = []
    for sense in definition.split("; "):
        if '(Cant.)' not in sense and '(J)' not in sense:
            glosses = []
            for gloss in sense.split(', '):
                # Single-word glosses become links, phrases stay plain text.
                if ' ' not in gloss:
                    gloss = _link(gloss)
                glosses.append(gloss)
            joined = u','.join(glosses)
            if joined:
                general.append(joined)
        # NOTE(review): for the two tagged kinds only single-word glosses are
        # kept (phrases are dropped).  The original nesting was lost; this is
        # the only reading in which the appended name is always defined.
        if '(Cant.)' in sense:
            # [8:] skips a leading '(Cant.) ' marker.
            links = [_link(g) for g in sense[8:].split(', ') if ' ' not in g]
            if links:
                general.append(u'【粤语】' + u','.join(links))
        if '(J)' in sense:
            # [4:] skips a leading '(J) ' marker.
            links = [_link(g) for g in sense[4:].split(', ') if ' ' not in g]
            if links:
                japanese.append(u','.join(links))
    return u';'.join(general), u';'.join(japanese)


def _word_lines(hanzi, pairs):
    """Render one bullet per headword of *pairs* that contains *hanzi*."""
    lines = []
    for first, second in pairs:
        if first == second:
            candidates = [first]
        else:
            candidates = [w for w in (first, second) if hanzi in w]
        for word in candidates:
            lines.append(
                u'* -{[[' + wikipedia.html2unicode(word) + u']]}-\n')
    return u''.join(lines)


def _translation_box(glosses):
    """Render the translation template holding the English glosses."""
    return u'{{-trans-}}\n{{transl|\n* {{en}}:' + glosses + u'\n}}\n'


def _foreign_header(lang, hanzi):
    """Render the heading, category and headword line of a language section."""
    return (u'{{-' + lang + u'-}}\n[[Category:{{subst:' + lang + u'}}|]]\n'
            u"<big>'''{{lang|" + lang + u'|' + hanzi + u"}}'''</big>\n")


def _han_section(hanzi, codepoint, unicode_forms, f):
    """Build the language-independent part: shape, references, encodings."""
    h2u = wikipedia.html2unicode
    out = [u"__NOTC__\n{{-han-}}\n<big>'''-{" + hanzi + u"}-'''</big>\n\n"]

    # Stroke count.
    if f['kTotalStrokes']:
        out.append(u'* {{HanTS|' + h2u(f['kTotalStrokes']) + u'}}\n')

    # Unicode radical.  The field may hold several space-separated values and
    # the radical number may carry an apostrophe marking a simplified radical
    # (e.g. "120'.3"); use the first value and index by the bare number.
    if f['kRSUnicode']:
        radical, strokes = f['kRSUnicode'].split()[0].split('.', 1)
        radical_index = int(radical.rstrip("'")) - 1
        out.append(u"* {{部首|%s|%s}}\n"
                   % (RADICALS[radical_index], h2u(strokes)))

    # Variants.  [7:] drops the first seven characters of the link text
    # (presumably a "U+XXXX " prefix -- confirm against the page markup).
    if f['kSimplifiedVariant']:
        out.append(u'* [[w:简体|]]:-{[['
                   + h2u(f['kSimplifiedVariant'][7:]) + u']]}-\n')
    elif f['kTraditionalVariant']:
        out.append(u'* [[w:繁体|]]:-{[['
                   + h2u(f['kTraditionalVariant'][7:]) + u']]}-\n')
    yiti = [_link(f[name][7:])
            for name in ('kSemanticVariant', 'kSpecializedSemanticVariant')
            if f[name]]
    bianti = [_link(f[name][7:])
              for name in ('kCompatibilityVariant', 'kZVariant')
              if f[name]]
    if yiti:
        out.append(u'* [[w:异体字|]]:-{'
                   + u','.join(remove_dups(yiti)) + u'}-\n')
    if bianti:
        out.append(u'* 其他变体:-{'
                   + u','.join(remove_dups(bianti)) + u'}-\n')

    # Numeric value: the first non-empty of the three fields.
    numeric = (f['kAccountingNumeric'] or f['kOtherNumeric']
               or f['kPrimaryNumeric'])
    if numeric:
        out.append(u'* [[Wiktionary:中文数字|数字]]:' + h2u(numeric) + u'\n')

    # Dictionary references.  Values look like "page.position"; an entry is
    # only emitted when the third character of the position part is 0.
    out.append(u'=== 参考 ===\n')
    if f['kDaeJaweon']:
        parts = f['kDaeJaweon'].split(".")
        if int(parts[1][2]) == 0:
            out.append(u'* {{DaeJaweon|%s|%s}}\n'
                       % (_num(parts[0]), _num(parts[1][:2])))
    if f['kIRGHanyuDaZidian']:
        parts = f['kIRGHanyuDaZidian'].split(".")
        if int(parts[1][2]) == 0:
            # First character of the location is passed separately from the
            # remaining digits.
            out.append(u'* {{HanYu|%s|%s|%s}}\n'
                       % (_num(parts[0][0]), _num(parts[0][1:]),
                          _num(parts[1][:2])))
    if f['kKangXi']:
        parts = f['kKangXi'].split(".")
        if int(parts[1][2]) == 0:
            # The template receives the page both zero-padded and as a number.
            out.append(u'* {{KangXi|%s|%s|%s}}\n'
                       % (parts[0], _num(parts[0]), _num(parts[1][:2])))
    if f['kSBGY']:
        try:
            parts = f['kSBGY'].split(".")
            out.append(u'* {{SBGY|%s|%s}}\n'
                       % (_num(parts[0]), _num(parts[1])))
        except (ValueError, IndexError):
            # Unexpected (e.g. multi-valued) format: skip this reference.
            pass
    if f['kCihaiT']:
        try:
            parts = f['kCihaiT'].split(".")
            out.append(u'* {{CihaiT|%s|%s|%s}}\n'
                       % (_num(parts[0]), _num(parts[1][0]),
                          _num(parts[1][1:])))
        except (ValueError, IndexError):
            pass
    out.append(u'* {{Unihanref|' + codepoint.upper() + u'}}\n')

    # Input codes and character encodings.
    out.append(u'=== 编码 ===\n')
    if f['kCangjie']:
        out.append(u'* [[Wiktionary:仓颉索引|仓颉输入法]]:'
                   + h2u(f['kCangjie']) + u'\n')
    four_corner = f['kFourCornerCode']
    if four_corner:
        if '.' in four_corner:
            parts = four_corner.split(".")
            out.append(u'* [[Wiktionary:四角号码索引|四角号码]]:' + h2u(parts[0])
                       + u'<sub>' + h2u(parts[1]) + u'</sub>\n')
        else:
            out.append(u'* [[Wiktionary:四角号码索引|四角号码]]:'
                       + h2u(four_corner) + u'\n')
    decimal, utf8, utf16, utf32 = unicode_forms
    out.append(u'* [[w:Unicode|]]编码:\n')
    out.append(u'**十进制:' + h2u(decimal) + u'\n')
    out.append(u'**UTF-8:' + h2u(utf8) + u'\n')
    out.append(u'**UTF-16:' + h2u(utf16) + u'\n')
    out.append(u'**UTF-32:' + h2u(utf32) + u'\n')
    for name, prefix in ENCODING_LABELS:
        if f[name]:
            out.append(prefix + h2u(f[name]) + u'\n')
    if f['kMainlandTelegraph'] or f['kTaiwanTelegraph']:
        out.append(u'* [[w:中文电码|]]:\n')
        if f['kMainlandTelegraph']:
            out.append(u'** 中国大陆:' + h2u(f['kMainlandTelegraph']) + u'\n')
        if f['kTaiwanTelegraph']:
            out.append(u'** 台湾:' + h2u(f['kTaiwanTelegraph']) + u'\n')
    return u''.join(out)


def _chinese_section(hanzi, f, zht, starting, ending):
    """Build the Chinese section (readings, glosses, compound words)."""
    h2u = wikipedia.html2unicode
    wordz = _word_lines(hanzi, starting)
    wordn = _word_lines(hanzi, ending)
    has_reading = f['kCantonese'] or f['kHanyuPinyin'] or f['kTang']
    if not (has_reading or zht or wordz or wordn):
        return u''
    out = [u"{{-zh-}}\n[[Category:{{subst:zh}}|]]\n<big>-{'''" + hanzi
           + u"'''}-</big>\n"]
    if wikipedia.Page(sitezh, hanzi).exists():
        out.append(u'{{wikipedia}}\n')
    if has_reading:
        out.append(u'{{-pron-}}\n')
    if f['kHanyuPinyin']:
        # Entries look like "location:readings" and are space-separated;
        # use the readings of the first entry only so that the location of a
        # following entry does not leak into the output.
        first_entry = f['kHanyuPinyin'].split(None, 1)[0]
        out.append(u'* [[Wiktionary:汉语拼音索引|汉语拼音]]:'
                   + h2u(first_entry.split(":")[1]) + u'\n')
    if f['kCantonese']:
        out.append(u'* [[w:粵拼|]]:' + h2u(f['kCantonese']) + u'\n')
    if f['kTang']:
        tang = f['kTang']
        if '*' in tang:
            # Drop the first character (a leading '*' marker).
            tang = tang[1:]
        out.append(u'* 唐音:' + h2u(tang) + u'\n')
    if zht:
        out.append(_translation_box(zht))
    if starting or ending:
        out.append(u'=== 组词 ===\n')
    if starting:
        out.append(wordz)
    if ending:
        out.append(u';逆序\n' + wordn)
    return u''.join(out)


def _japanese_section(hanzi, f, jpt):
    """Build the Japanese section (readings and glosses)."""
    h2u = wikipedia.html2unicode
    if not (jpt or f['kJapaneseKun'] or f['kJapaneseOn']):
        return u''
    out = [_foreign_header(u'ja', hanzi)]
    if wikipedia.Page(siteja, hanzi).exists():
        out.append(u'{{wikipedia|lang=ja}}\n')
    if f['kJapaneseKun'] or f['kJapaneseOn']:
        out.append(u'{{-pron-}}\n')
    if f['kJapaneseKun']:
        out.append(u'* [[w:訓讀|]]:' + h2u(f['kJapaneseKun'].lower()) + u'\n')
    if f['kJapaneseOn']:
        out.append(u'* [[w:音讀|]]:' + h2u(f['kJapaneseOn'].lower()) + u'\n')
    if jpt:
        out.append(_translation_box(jpt))
    return u''.join(out)


def _korean_section(hanzi, f):
    """Build the Korean section (Hangul and Yale romanisation)."""
    h2u = wikipedia.html2unicode
    if not (f['kHangul'] or f['kKorean']):
        return u''
    out = [_foreign_header(u'ko', hanzi) + u'{{-pron-}}\n']
    if wikipedia.Page(siteko, hanzi).exists():
        out.append(u'{{wikipedia|lang=ko}}\n')
    if f['kHangul']:
        out.append(u'* [[Wiktionary:諺文索引|諺文]]:[['
                   + h2u(f['kHangul']) + u']]\n')
    if f['kKorean']:
        out.append(u'* 耶鲁式:' + h2u(f['kKorean'].lower()) + u'\n')
    return u''.join(out)


def _vietnamese_section(hanzi, f):
    """Build the Vietnamese section (reading only)."""
    if not f['kVietnamese']:
        return u''
    out = [_foreign_header(u'vi', hanzi) + u'{{-pron-}}\n']
    if wikipedia.Page(sitevi, hanzi).exists():
        out.append(u'{{wikipedia|lang=vi}}\n')
    out.append(u'* [[' + wikipedia.html2unicode(f['kVietnamese']) + u']]\n')
    return u''.join(out)


def _save_page(hanzi, wikitext):
    """Write the entry, or park it on a user subpage if the title is taken."""
    pg = wikipedia.Page(site, hanzi)
    if pg.exists() and not pg.isRedirectPage():
        # A real entry already exists: store the text separately for a
        # manual merge instead of overwriting it.
        pg = wikipedia.Page(site, u'User:Sz-iwbot/Unihan/%s' % hanzi)
        pg.put(wikitext, u"[[%s]]已存在,临时存放等待合并" % hanzi)
        wikipedia.output(
            u'[[%s]]已存在,临时存放[[user:Sz-iwbot/Unihan/%s]]等待合并'
            % (hanzi, hanzi))
    else:
        pg.put(wikitext, u"增加\"%s\"的unihan数据" % hanzi)
        wikipedia.output(u'增加[[%s]]的unihan数据' % hanzi)


def _process_codepoint(x, cedict):
    """Scrape, build and save the entry for the character with ordinal *x*."""
    codepoint = '%x' % x
    # Fetch and parse the page; only the <blockquote> part is of interest.
    html = GetPage(codepoint)
    utfcheck = chardet.detect(html)['encoding']
    blockquote = BeautifulSoup(html).blockquote
    # The character itself: read it from the page when the page was detected
    # as UTF-8, otherwise derive it from the code point.
    if utfcheck == 'utf-8':
        glyphs = blockquote.find(text='Glyphs').findNext('table')
        hanzi = glyphs.font.contents[0]
    else:
        hanzi = wikipedia.html2unicode('&#x' + codepoint + ';')
    print(hanzi)
    # Decimal, UTF-8, UTF-16 and UTF-32 forms, in table order.
    cells = blockquote.find(text='Encoding Forms').findNext('table') \
        .findAll('td')
    unicode_forms = [cells[i].string for i in range(4)]

    f = _parse_fields(blockquote)
    print(u'%s %s %s' % (f['kAccountingNumeric'], f['kOtherNumeric'],
                         f['kPrimaryNumeric']))

    zht, jpt = _split_definition(f['kDefinition'])
    starting, ending = _find_words(hanzi, cedict)

    wikitext = _han_section(hanzi, codepoint, unicode_forms, f)
    wikitext = wikitext + _chinese_section(hanzi, f, zht, starting, ending)
    wikitext = wikitext + _japanese_section(hanzi, f, jpt)
    wikitext = wikitext + _korean_section(hanzi, f)
    wikitext = wikitext + _vietnamese_section(hanzi, f)
    # NOTE(review): the interwiki links are emitted for every character; the
    # lost indentation would also allow reading them as Vietnamese-only.
    wikitext = wikitext + u'[[en:' + hanzi + u']]\n' + u'[[ja:' + hanzi + u']]\n'
    _save_page(hanzi, wikitext)


# Resume from the stored checkpoint, else start at U+4E00; stop after U+9FFF.
unibegin = _load_start_codepoint(PROGRESS_FILE, 0x4E00)
uniend = 0x9FFF + 1
try:
    # The word list does not change during the run: read it once instead of
    # once per character.
    cedict_entries = _load_cedict(CEDICT_FILE)
    for x in range(unibegin, uniend):
        _process_codepoint(x, cedict_entries)
        # Store the NEXT code point so a restart does not redo (and then
        # duplicate onto the holding page) the character just written.
        _save_progress(PROGRESS_FILE, x + 1)
        # Random pause between edits.
        randomsleep = random.randint(1, 100)
        wikipedia.output('sleep %ss' % str(randomsleep))
        time.sleep(randomsleep)
finally:
    # Always run the framework's shutdown call, also after an error.
    wikipedia.stopme()