Nothing Special   »   [go: up one dir, main page]

跳转到内容

User:Sz-iwbot/hanzi.py

維基詞典,自由的多語言詞典
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup          # For processing HTML
import sys, os, chardet
import wikipedia
import urllib2,time,random, urllib
# Force the interpreter-wide default string encoding to UTF-8.
# (Original author's note: this works around a Korean text encoding problem.)
# Later code mixes byte strings and unicode strings and depends on this.
reload(sys)
sys.setdefaultencoding('utf-8')

# Site returned by wikipedia.getSite() with no arguments; generated entries
# are saved to pages on this site.
site = wikipedia.getSite()
# Chinese, Japanese, Korean and Vietnamese Wikipedia sites; the main loop asks
# each one whether an article titled with the character exists.
sitezh = wikipedia.getSite(code='zh',fam='wikipedia')
siteja = wikipedia.getSite(code='ja',fam='wikipedia')
siteko = wikipedia.getSite(code='ko',fam='wikipedia')
sitevi = wikipedia.getSite(code='vi',fam='wikipedia')

def remove_dups(lst):
    """Return the distinct elements of lst as a list, in first-seen order.

    Keeping the order of first appearance makes the result deterministic,
    so text built by joining it is stable from run to run.

    Limitation: elements must be hashable, so lists, dicts etc. cannot be
    used as list elements.
    """
    seen = set()
    unique = []
    for item in lst:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique

def GetPage(codepoint):
    """Fetch the Unihan database page for one code point.

    codepoint -- the code point as a hexadecimal string; it is substituted
                 into the GetUnihanData.pl query URL as-is.

    Returns the response body exactly as read from the server.
    Any error raised by urllib2.urlopen or read() propagates to the caller.
    """
    baseurl = u'http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint=%s'
    url = baseurl % codepoint
    u = urllib2.urlopen(url)
    try:
        return u.read()
    finally:
        # The caller fetches thousands of pages in one run; close each
        # response explicitly instead of leaving it open until collected.
        u.close()

# Resume support: unihan.dat holds a decimal code point saved by a previous
# run.  If the file is missing, unreadable or does not contain an integer,
# start from U+4E00.
try:
    log = open('unihan.dat')
    try:
        unibegin = int(log.read())
    finally:
        # Only reached once open() succeeded, so log is always bound here.
        log.close()
except (IOError, ValueError):
    unibegin = 0x4E00

# Exclusive upper bound of the range to process: the last code point is U+9FFF.
uniend = 0x9FFF + 1

# Kangxi radicals 1-214 in order: rlist[n - 1] is the glyph of radical n.
# Built once, outside the loop, because it never changes.
# Radical 55 is u'廾' (the list previously had the look-alike u'廿').
rlist=[u'一', u'丨', u'丶', u'丿', u'乙', u'亅', u'二', u'亠', u'人', u'儿', u'入', u'八', u'冂', u'冖', u'冫', u'几', u'凵', u'刀', u'力', u'勹', u'匕', u'匚', u'匸', u'十', u'卜', u'卩', u'厂', u'厶', u'又', u'口', u'囗', u'土', u'士', u'夂', u'夊', u'夕', u'大', u'女', u'子', u'宀', u'寸', u'小', u'尢', u'尸', u'屮', u'山', u'巛', u'工', u'己', u'巾', u'干', u'幺', u'广', u'廴', u'廾', u'弋', u'弓', u'彐', u'彡', u'彳', u'心', u'戈', u'戶', u'手', u'支', u'攴', u'文', u'斗', u'斤', u'方', u'无', u'日', u'曰', u'月', u'木', u'欠', u'止', u'歹', u'殳', u'毋', u'比', u'毛', u'氏', u'气', u'水', u'火', u'爪', u'父', u'爻', u'爿', u'片', u'牙', u'牛', u'犬', u'玄', u'玉', u'瓜', u'瓦', u'甘', u'生', u'用', u'田', u'疋', u'疒', u'癶', u'白', u'皮', u'皿', u'目', u'矛', u'矢', u'石', u'示', u'禸', u'禾', u'穴', u'立', u'竹', u'米', u'糸', u'缶', u'网', u'羊', u'羽', u'老', u'而', u'耒', u'耳', u'聿', u'肉', u'臣', u'自', u'至', u'臼', u'舌', u'舛', u'舟', u'艮', u'色', u'艸', u'虍', u'虫', u'血', u'行', u'衣', u'西', u'見', u'角', u'言', u'谷', u'豆', u'豕', u'豸', u'貝', u'赤', u'走', u'足', u'身', u'車', u'辛', u'辰', u'辵', u'邑', u'酉', u'釆', u'里', u'金', u'長', u'門', u'阜', u'隶', u'隹', u'雨', u'靑', u'非', u'面', u'革', u'韋', u'韭', u'音', u'頁', u'風', u'飛', u'食', u'首', u'香', u'馬', u'骨', u'高', u'髟', u'鬥', u'鬯', u'鬲', u'鬼', u'魚', u'鳥', u'鹵', u'鹿', u'麥', u'麻', u'黃', u'黍', u'黑', u'黹', u'黽', u'鼎', u'鼓', u'鼠', u'鼻', u'齊', u'齒', u'龍', u'龜', u'龠']


def _section_table(blockquote, heading):
    """Return the <table> that follows the text node `heading`, or '' if the
    heading (or the blockquote itself) is missing from the page."""
    try:
        return blockquote.find(text=heading).findNext('table')
    except (AttributeError, TypeError):
        return ''


def _field(table, key, tag='code'):
    """Return the .string of the first `tag` element after the text node `key`
    inside `table`, or '' when the table or the key is absent.

    Only the "not found" failures are caught (None has no findNext/string,
    '' has no keyword find), so Ctrl-C and real errors are not swallowed.
    """
    try:
        return table.find(text=key).findNext(tag).string
    except (AttributeError, TypeError):
        return ''


def _wikilinks(terms):
    """Return a [[link]] for every term that contains no space; terms with a
    space are skipped."""
    return [u'[[' + wikipedia.html2unicode(term) + u']]'
            for term in terms if ' ' not in term]


def _word_bullets(entries, hanzi):
    """Return the bullet lines for CEDICT entries (lists whose first two items
    are the two written forms): one line if both forms are equal, otherwise
    one line per form that contains hanzi."""
    text = u''
    for entry in entries:
        if entry[0] == entry[1]:
            forms = [entry[0]]
        else:
            forms = [form for form in entry[:2] if hanzi in form]
        for form in forms:
            text = text + u'* -{[[' + wikipedia.html2unicode(form) + u']]}-\n'
    return text


# Load the CEDICT word list once and decode it once, instead of re-reading the
# whole file for every code point.
cedict = open('cedict_ts.u8')
try:
    lines = [raw.decode('utf-8') for raw in cedict]
finally:
    cedict.close()

try:
    for x in range(unibegin, uniend):
        codepoint = hex(x)[2:]
        # Fetch the page and guess its encoding from the raw bytes.
        html = GetPage(codepoint)
        utfcheck = chardet.detect(html)['encoding']

        # Parse the page; only the <blockquote> part is used below.
        soup = BeautifulSoup(html)
        blockquote = soup.blockquote

        # The character itself: taken from the page when it was detected as
        # UTF-8, otherwise rebuilt from a numeric character reference.
        if utfcheck == 'utf-8':
            hanzi = blockquote.find(text='Glyphs').findNext('table').font.contents[0]
        else:
            hanzi = wikipedia.html2unicode('&#x' + codepoint + ';')
        print(hanzi)

        # Encoding forms (first four cells of the table).
        Encoding = blockquote.find(text='Encoding Forms').findNext('table').findAll('td')
        Decimal = Encoding[0].string
        UTF8 = Encoding[1].string
        UTF16 = Encoding[2].string
        UTF32 = Encoding[3].string

        # IRG (Ideographic Rapporteur Group) sources.
        IRG = _section_table(blockquote, 'IRG Sources')
        kIICore = _field(IRG, 'kIICore')              # International Ideographs Core
        kIRG_GSource = _field(IRG, 'kIRG_GSource')    # PRC national standards? GB 2312 etc. (hex?)
        kIRG_HSource = _field(IRG, 'kIRG_HSource')    # Hong Kong Supplementary Character Set (hex?)
        kIRG_JSource = _field(IRG, 'kIRG_JSource')    # several Japanese sources (hex?)
        kIRG_KPSource = _field(IRG, 'kIRG_KPSource')  # North Korea
        kIRG_KSource = _field(IRG, 'kIRG_KSource')    # South Korea
        kIRG_TSource = _field(IRG, 'kIRG_TSource')    # Taiwan
        kIRG_USource = _field(IRG, 'kIRG_USource')    # http://unicode.org/reports/tr45/#UAX38
        kIRG_VSource = _field(IRG, 'kIRG_VSource')    # Vietnam
        kIRG_MSource = _field(IRG, 'kIRG_MSource')    # Macao Information Systems Character Set

        # Dictionary indices.
        Dictionary = _section_table(blockquote, 'Dictionary Indices')
        kCheungBauerIndex = _field(Dictionary, 'kCheungBauerIndex')  # Cantonese written with Chinese characters (Journal of Chinese Linguistics monograph series)
        kCowles = _field(Dictionary, 'kCowles')                      # pocket dictionary of Cantonese
        kDaeJaweon = _field(Dictionary, 'kDaeJaweon')                # Dae Jaweon (Korean) dictionary
        kFennIndex = _field(Dictionary, 'kFennIndex')                # Fenn's Chinese-English Pocket Dictionary ("Five Thousand Dictionary")
        kGSR = _field(Dictionary, 'kGSR')                            # Hanwendian
        kHanYu = _field(Dictionary, 'kHanYu')                        # Hanyu Da Zidian
        kIRGDaeJaweon = _field(Dictionary, 'kIRGDaeJaweon')          # Dae Jaweon
        kIRGDaiKanwaZiten = _field(Dictionary, 'kIRGDaiKanwaZiten')  # Dai Kan-Wa Jiten
        kIRGHanyuDaZidian = _field(Dictionary, 'kIRGHanyuDaZidian')  # Hanyu Da Zidian
        kIRGKangXi = _field(Dictionary, 'kIRGKangXi')                # Kangxi Dictionary
        kKangXi = _field(Dictionary, 'kKangXi')                      # Kangxi Dictionary
        kKarlgren = _field(Dictionary, 'kKarlgren')                  # Analytic Dictionary of Chinese and Sino-Japanese
        kLau = _field(Dictionary, 'kLau')                            # A Practical Cantonese-English Dictionary
        kMatthews = _field(Dictionary, 'kMatthews')                  # Mathews' Chinese-English Dictionary
        kMeyerWempe = _field(Dictionary, 'kMeyerWempe')              # The Student's Cantonese-English Dictionary
        kMorohashi = _field(Dictionary, 'kMorohashi')                # Dai Kan-Wa Jiten
        kNelson = _field(Dictionary, 'kNelson')                      # Modern Reader's Japanese-English Character Dictionary
        kSBGY = _field(Dictionary, 'kSBGY')                          # Songben Guangyun

        # Other dictionary-like lookup data.
        DictionaryLikeData = _section_table(blockquote, 'Dictionary-like Data')
        kCangjie = _field(DictionaryLikeData, 'kCangjie')                # Cangjie input code
        kCheungBauer = _field(DictionaryLikeData, 'kCheungBauer')        # Cantonese written with Chinese characters
        kCihaiT = _field(DictionaryLikeData, 'kCihaiT')                  # Cihai
        kFenn = _field(DictionaryLikeData, 'kFenn')                      # "Five Thousand Dictionary"
        kFourCornerCode = _field(DictionaryLikeData, 'kFourCornerCode')  # four-corner code
        kFrequency = _field(DictionaryLikeData, 'kFrequency')            # frequency in Traditional Chinese USENET posts
        kGradeLevel = _field(DictionaryLikeData, 'kGradeLevel')          # Longman elementary Chinese dictionary
        kHDZRadBreak = _field(DictionaryLikeData, 'kHDZRadBreak')        # Hanyu Da Zidian
        kHKGlyph = _field(DictionaryLikeData, 'kHKGlyph')                # List of commonly used character forms (2000 revision), Hong Kong Institute of Education, ISBN 962-949-040-4
        kPhonetic = _field(DictionaryLikeData, 'kPhonetic')              # "Ten Thousand Characters: An Analytic Dictionary"
        kTotalStrokes = _field(DictionaryLikeData, 'kTotalStrokes')      # stroke count

        # Numeric values (a character has at most one of these).
        NumericValues = _section_table(blockquote, 'Numeric Values')
        kAccountingNumeric = _field(NumericValues, 'kAccountingNumeric')  # accounting (anti-fraud) numeral
        kOtherNumeric = _field(NumericValues, 'kOtherNumeric')            # special numeral
        kPrimaryNumeric = _field(NumericValues, 'kPrimaryNumeric')        # ordinary numeral
        print('%s %s %s' % (kAccountingNumeric, kOtherNumeric, kPrimaryNumeric))

        # Mappings to other character sets.
        OtherMappings = _section_table(blockquote, 'Other Mappings')
        kBigFive = _field(OtherMappings, 'kBigFive')                      # Big5
        kCCCII = _field(OtherMappings, 'kCCCII')                          # Chinese Character Code for Information Interchange (CCCII)
        kCNS1986 = _field(OtherMappings, 'kCNS1986')                      # Taiwan CNS 11643-1986
        kCNS1992 = _field(OtherMappings, 'kCNS1992')                      # Taiwan CNS 11643-1992
        kEACC = _field(OtherMappings, 'kEACC')                            # East Asian Character Code (EACC)
        kGB0 = _field(OtherMappings, 'kGB0')                              # GB 2312-80
        kGB1 = _field(OtherMappings, 'kGB1')                              # GB 12345-90
        kGB3 = _field(OtherMappings, 'kGB3')                              # GB 7589-87
        kGB5 = _field(OtherMappings, 'kGB5')                              # GB 7590-87
        kGB7 = _field(OtherMappings, 'kGB7')                              # GB 8565-89
        kGB8 = _field(OtherMappings, 'kGB8')                              # GB 8565-89
        kHKSCS = _field(OtherMappings, 'kHKSCS')                          # Hong Kong Supplementary Character Set
        kIBMJapan = _field(OtherMappings, 'kIBMJapan')                    # IBM Japanese encoding
        kJis0 = _field(OtherMappings, 'kJis0')                            # JIS X 0208-1990
        kJis1 = _field(OtherMappings, 'kJis1')                            # JIS X 0212-1990
        kJIS0213 = _field(OtherMappings, 'kJIS0213')                      # JIS X 0213-2000
        kKPS0 = _field(OtherMappings, 'kKPS0')                            # KPS 9566-97
        kKPS1 = _field(OtherMappings, 'kKPS1')                            # KPS 10721-2000
        kKSC0 = _field(OtherMappings, 'kKSC0')                            # KS X 1001:1992 (KS C 5601-1989)
        kKSC1 = _field(OtherMappings, 'kKSC1')                            # KS X 1002:1991 (KS C 5657-1991)
        kMainlandTelegraph = _field(OtherMappings, 'kMainlandTelegraph')  # PRC telegraph code
        kPseudoGB1 = _field(OtherMappings, 'kPseudoGB1')                  # pseudo GB 12345-90
        kTaiwanTelegraph = _field(OtherMappings, 'kTaiwanTelegraph')      # Taiwan telegraph code
        kXerox = _field(OtherMappings, 'kXerox')                          # Xerox code

        # Radical-stroke indices.
        RadicalStrokeCounts = _section_table(blockquote, 'Radical-stroke Indices')
        kRSAdobe_Japan1_6 = _field(RadicalStrokeCounts, 'kRSAdobe_Japan1_6')  # Adobe-Japan1-6
        kRSJapanese = _field(RadicalStrokeCounts, 'kRSJapanese')              # Japanese radical
        kRSKangXi = _field(RadicalStrokeCounts, 'kRSKangXi')                  # Kangxi Dictionary
        kRSKanWa = _field(RadicalStrokeCounts, 'kRSKanWa')                    # Dai Kan-Wa Jiten
        kRSKorean = _field(RadicalStrokeCounts, 'kRSKorean')                  # Korean radical
        kRSUnicode = _field(RadicalStrokeCounts, 'kRSUnicode')                # Unicode radical

        # Readings.
        Readings = _section_table(blockquote, 'Readings')
        kCantonese = _field(Readings, 'kCantonese')      # Jyutping (Linguistic Society of Hong Kong romanization)
        kDefinition = _field(Readings, 'kDefinition')    # English gloss
        kHangul = _field(Readings, 'kHangul')            # Hangul
        kHanyuPinlu = _field(Readings, 'kHanyuPinlu')    # pinyin with frequency
        kHanyuPinyin = _field(Readings, 'kHanyuPinyin')  # Hanyu Pinyin
        kJapaneseKun = _field(Readings, 'kJapaneseKun')  # Japanese kun reading
        kJapaneseOn = _field(Readings, 'kJapaneseOn')    # Japanese on reading
        kKorean = _field(Readings, 'kKorean')            # Korean pronunciation (Yale system)
        kMandarin = _field(Readings, 'kMandarin')        # Mandarin
        kTang = _field(Readings, 'kTang')                # Tang-dynasty pronunciation
        kVietnamese = _field(Readings, 'kVietnamese')    # Vietnamese
        kXHC1983 = _field(Readings, 'kXHC1983')          # Xiandai Hanyu Cidian

        # Variants; these values sit in <a> elements rather than <code>.
        Variants = _section_table(blockquote, 'Variants')
        kCompatibilityVariant = _field(Variants, 'kCompatibilityVariant', 'a')              # from UnicodeData.txt
        kSemanticVariant = _field(Variants, 'kSemanticVariant', 'a')                        # variant form?
        kSimplifiedVariant = _field(Variants, 'kSimplifiedVariant', 'a')                    # simplified form
        kSpecializedSemanticVariant = _field(Variants, 'kSpecializedSemanticVariant', 'a')  # variant
        kTraditionalVariant = _field(Variants, 'kTraditionalVariant', 'a')                  # traditional form
        kZVariant = _field(Variants, 'kZVariant', 'a')                                      # variant

        # ---- Entry skeleton ----
        wikitext = u"__NOTC__\n{{-han-}}\n<big>\'\'\'-{" + hanzi + u'}-\'\'\'</big>\n\n'

        # Stroke count.
        if kTotalStrokes:
            wikitext = wikitext + u'* {{HanTS|' + wikipedia.html2unicode(kTotalStrokes) + u'}}\n'

        # Unicode radical.  The value is "radical.strokes".  Per UAX #38 the
        # radical number can carry a trailing apostrophe (simplified form of
        # the radical) and the field can hold several space-separated values;
        # int() used to crash on the apostrophe.  Use the first value only.
        if kRSUnicode:
            rs_parts = kRSUnicode.split()[0].split(".")
            radical_no = int(rs_parts[0].rstrip("'"))
            bushou = u"* {{部首|%s|%s}}\n" % (rlist[radical_no - 1], wikipedia.html2unicode(rs_parts[1]))
            wikitext = wikitext + bushou

        # Variants.  [7:] drops the first seven characters of the link text.
        bianti = []
        yiti = []
        if kSimplifiedVariant:
            wikitext = wikitext + u'* [[w:简体|]]:-{[[' + wikipedia.html2unicode(kSimplifiedVariant[7:]) + u']]}-\n'
        elif kTraditionalVariant:
            wikitext = wikitext + u'* [[w:繁体|]]:-{[[' + wikipedia.html2unicode(kTraditionalVariant[7:]) + u']]}-\n'
        if kSemanticVariant:
            yiti.append(u'[[' + wikipedia.html2unicode(kSemanticVariant[7:]) + u']]')
        if kSpecializedSemanticVariant:
            yiti.append(u'[[' + wikipedia.html2unicode(kSpecializedSemanticVariant[7:]) + u']]')
        if kCompatibilityVariant:
            bianti.append(u'[[' + wikipedia.html2unicode(kCompatibilityVariant[7:]) + u']]')
        if kZVariant:
            bianti.append(u'[[' + wikipedia.html2unicode(kZVariant[7:]) + u']]')
        if yiti:
            yititext = u','.join(remove_dups(yiti))
            wikitext = wikitext + u'* [[w:异体字|]]:-{' + yititext + u'}-\n'
        if bianti:
            biantitext = u','.join(remove_dups(bianti))
            wikitext = wikitext + u'* 其他变体:-{' + biantitext + u'}-\n'

        # Numeric value: first non-empty of accounting / other / primary.
        numeric = kAccountingNumeric or kOtherNumeric or kPrimaryNumeric
        if numeric:
            wikitext = wikitext + u'* [[Wiktionary:中文数字|数字]]:' + wikipedia.html2unicode(numeric) + u'\n'

        # ---- Dictionary references ----
        # For the three "page.position" indices below, a line is only emitted
        # when the third character after the dot is 0.
        wikitext = wikitext + u'=== 参考 ===\n'
        if kDaeJaweon:
            djw = kDaeJaweon.split(".")
            if int(djw[1][2]) == 0:
                DaeJaweon = u'* {{DaeJaweon|%s|%s}}\n' % (
                    wikipedia.html2unicode(str(int(djw[0]))),
                    wikipedia.html2unicode(str(int(djw[1][:2]))))
                wikitext = wikitext + DaeJaweon
        if kIRGHanyuDaZidian:
            hdz = kIRGHanyuDaZidian.split(".")
            if int(hdz[1][2]) == 0:
                # First digit before the dot, then the remaining digits.
                HanYu = u'* {{HanYu|%s|%s|%s}}\n' % (
                    wikipedia.html2unicode(str(int(hdz[0][0]))),
                    wikipedia.html2unicode(str(int(hdz[0][1:]))),
                    wikipedia.html2unicode(str(int(hdz[1][:2]))))
                wikitext = wikitext + HanYu
        if kKangXi:
            kx = kKangXi.split(".")
            if int(kx[1][2]) == 0:
                # The template gets the page both as written and as an integer.
                KangXi = u'* {{KangXi|%s|%s|%s}}\n' % (
                    kx[0],
                    wikipedia.html2unicode(str(int(kx[0]))),
                    wikipedia.html2unicode(str(int(kx[1][:2]))))
                wikitext = wikitext + KangXi
        if kSBGY:
            try:
                sbgy = kSBGY.split(".")
                SBGY = u'* {{SBGY|%s|%s}}\n' % (
                    wikipedia.html2unicode(str(int(sbgy[0]))),
                    wikipedia.html2unicode(str(int(sbgy[1]))))
                wikitext = wikitext + SBGY
            except (ValueError, IndexError):
                # Value is not a single "number.number" pair; omit the line.
                pass
        if kCihaiT:
            try:
                cihai = kCihaiT.split(".")
                CihaiT = u'* {{CihaiT|%s|%s|%s}}\n' % (
                    wikipedia.html2unicode(str(int(cihai[0]))),
                    wikipedia.html2unicode(str(int(cihai[1][0]))),
                    wikipedia.html2unicode(str(int(cihai[1][1:]))))
                wikitext = wikitext + CihaiT
            except (ValueError, IndexError):
                # Value is not in the expected numeric form; omit the line.
                pass

        wikitext = wikitext + u'* {{Unihanref|' + codepoint.upper() + u'}}\n'

        # ---- Encodings ----
        wikitext = wikitext + u'=== 编码 ===\n'
        if kCangjie:
            Cangjie = u'* [[Wiktionary:仓颉索引|仓颉输入法]]:' + wikipedia.html2unicode(kCangjie) + u'\n'
            wikitext = wikitext + Cangjie
        if kFourCornerCode:
            if '.' in kFourCornerCode:
                # The part after the dot is shown as a subscript.
                fcc = kFourCornerCode.split(".")
                FourCornerCode = u'* [[Wiktionary:四角号码索引|四角号码]]:' + wikipedia.html2unicode(fcc[0]) + u'<sub>' + wikipedia.html2unicode(fcc[1]) + u'</sub>\n'
            else:
                FourCornerCode = u'* [[Wiktionary:四角号码索引|四角号码]]:' + wikipedia.html2unicode(kFourCornerCode) + u'\n'
            wikitext = wikitext + FourCornerCode
        wikitext = wikitext + u'* [[w:Unicode|]]编码:\n'
        wikitext = wikitext + u'**十进制:' + wikipedia.html2unicode(Decimal) + u'\n'
        wikitext = wikitext + u'**UTF-8:' + wikipedia.html2unicode(UTF8) + u'\n'
        wikitext = wikitext + u'**UTF-16:' + wikipedia.html2unicode(UTF16) + u'\n'
        wikitext = wikitext + u'**UTF-32:' + wikipedia.html2unicode(UTF32) + u'\n'
        # One bullet per legacy encoding that has a value, in this fixed order.
        for label, value in (
                (u'* [[w:Big 5|]]:', kBigFive),
                (u'* [[w:CCCII|]]:', kCCCII),
                (u'* [[w:中文標準交換碼|CNS 11643-1986]]:', kCNS1986),
                (u'* [[w:中文標準交換碼|CNS 11643-1992]]:', kCNS1992),
                (u'* EACC:', kEACC),
                (u'* [[w:GB 2312|GB 2312-80]]:', kGB0),
                (u'* [[w:GB 12345|GB 12345-90]]:', kGB1),
                (u'* [[w:GB 7589|GB 7589-87]]:', kGB3),
                (u'* [[w:GB 7590|GB 7590-87]]:', kGB5),
                (u'* [[w:GB 8565|GB 8565-89]]:', kGB7),
                (u'* [[w:GB 8565|GB 8565-89]]:', kGB8),
                (u'* [[w:香港增補字符集|HKSCS]]:', kHKSCS),
                (u'* [[w:JIS X 0208|JIS X 0208-1990]]:', kJis0),
                (u'* [[w:JIS X 0212|JIS X 0212-1990]]:', kJis1),
                (u'* [[w:JIS X 0213|JIS X 0213-2000]]:', kJIS0213),
                (u'* [[w:KPS 9566|KPS 9566-97]]:', kKPS0),
                (u'* [[w:KPS 10721|KPS 10721-2000]]:', kKPS1),
                (u'* [[w:KS X 1001|KS X 1001:1992]]:', kKSC0),
                (u'* [[w:KS X 1002|KS X 1002:1991]]:', kKSC1),
                (u'* [[w:GB 12345|伪GB码]]:', kPseudoGB1),
        ):
            if value:
                wikitext = wikitext + label + wikipedia.html2unicode(value) + u'\n'
        if kMainlandTelegraph or kTaiwanTelegraph:
            wikitext = wikitext + u'* [[w:中文电码|]]:\n'
        if kMainlandTelegraph:
            MainlandTelegraph = u'** 中国大陆:' + wikipedia.html2unicode(kMainlandTelegraph) + u'\n'
            wikitext = wikitext + MainlandTelegraph
        if kTaiwanTelegraph:
            TaiwanTelegraph = u'** 台湾:' + wikipedia.html2unicode(kTaiwanTelegraph) + u'\n'
            wikitext = wikitext + TaiwanTelegraph

        # ---- Definitions ----
        # kDefinition is split on "; ".  Senses containing "(Cant.)" or "(J)"
        # go to the Cantonese-labelled / Japanese lists after the first 8 / 4
        # characters are dropped; all other senses are general glosses.
        zht = ''
        jpt = ''
        if kDefinition:
            deflist = []
            jdeflist = []
            for d in kDefinition.split("; "):
                is_cantonese = '(Cant.)' in d
                is_japanese = '(J)' in d
                if not is_cantonese and not is_japanese:
                    links = _wikilinks(d.split(', '))
                    if links:
                        deflist.append(u','.join(links))
                if is_cantonese:
                    links = _wikilinks(d[8:].split(', '))
                    if links:
                        deflist.append(u'【粤语】' + u','.join(links))
                if is_japanese:
                    links = _wikilinks(d[4:].split(', '))
                    if links:
                        jdeflist.append(u','.join(links))
            zht = u';'.join(deflist)
            jpt = u';'.join(jdeflist)

            zhtran = u'{{-trans-}}\n{{transl|\n* {{en}}:' + zht + u'\n}}\n'
            jptran = u'{{-trans-}}\n{{transl|\n* {{en}}:' + jpt + u'\n}}\n'

        # ---- Compound words from CEDICT ----
        # Each line is split on the first two spaces; the first two pieces are
        # compared with hanzi.  Words starting with hanzi go to linelist,
        # words ending with it to linenlist.
        linelist = []
        linenlist = []
        for line in lines:
            if hanzi in line:
                l = line.split(u' ', 2)
                # Need both forms, and the first must be longer than one char.
                if len(l) < 2 or len(l[0]) <= 1:
                    continue
                if l[0][0] == hanzi or l[1][0] == hanzi:
                    linelist.append(l)
                elif l[0][-1] == hanzi or l[1][-1] == hanzi:
                    linenlist.append(l)

        wordz = _word_bullets(linelist, hanzi)
        wordn = _word_bullets(linenlist, hanzi)

        # ---- Chinese section ----
        if kCantonese or kHanyuPinyin or kTang or zht or wordz or wordn:
            wikitext = wikitext + u'{{-zh-}}\n[[Category:{{subst:zh}}|]]\n<big>-{\'\'\'' + hanzi + u'\'\'\'}-</big>\n'
            zhwppg = wikipedia.Page(sitezh, hanzi)
            if zhwppg.exists():
                wikitext = wikitext + u'{{wikipedia}}\n'
        if kCantonese or kHanyuPinyin or kTang:
            wikitext = wikitext + u'{{-pron-}}\n'
        if kHanyuPinyin:
            # Keep what follows the first ':' of the value.
            HanyuPinyin = u'* [[Wiktionary:汉语拼音索引|汉语拼音]]:' + wikipedia.html2unicode(kHanyuPinyin.split(":")[1]) + u'\n'
            wikitext = wikitext + HanyuPinyin
        if kCantonese:
            Cantonese = u'* [[w:粵拼|]]:' + wikipedia.html2unicode(kCantonese) + u'\n'
            wikitext = wikitext + Cantonese
        if kTang:
            # When the value contains '*', its first character is dropped.
            if '*' in kTang:
                Tang = u'* 唐音:' + wikipedia.html2unicode(kTang[1:]) + u'\n'
            else:
                Tang = u'* 唐音:' + wikipedia.html2unicode(kTang) + u'\n'
            wikitext = wikitext + Tang
        if zht:
            wikitext = wikitext + zhtran
        if linelist or linenlist:
            wikitext = wikitext + u'=== 组词 ===\n'
        if linelist:
            wikitext = wikitext + wordz
        if linenlist:
            wikitext = wikitext + u';逆序\n' + wordn

        # ---- Japanese section ----
        if jpt or kJapaneseKun or kJapaneseOn:
            wikitext = wikitext + u'{{-ja-}}\n[[Category:{{subst:ja}}|]]\n<big>\'\'\'{{lang|ja|' + hanzi + u'}}\'\'\'</big>\n'
            jawppg = wikipedia.Page(siteja, hanzi)
            if jawppg.exists():
                wikitext = wikitext + u'{{wikipedia|lang=ja}}\n'
        if kJapaneseKun or kJapaneseOn:
            wikitext = wikitext + u'{{-pron-}}\n'
        if kJapaneseKun:
            JapaneseKun = u'* [[w:訓讀|]]:' + wikipedia.html2unicode(kJapaneseKun.lower()) + u'\n'
            wikitext = wikitext + JapaneseKun
        if kJapaneseOn:
            JapaneseOn = u'* [[w:音讀|]]:' + wikipedia.html2unicode(kJapaneseOn.lower()) + u'\n'
            wikitext = wikitext + JapaneseOn
        if jpt:
            wikitext = wikitext + jptran

        # ---- Korean section ----
        if kHangul or kKorean:
            wikitext = wikitext + u'{{-ko-}}\n[[Category:{{subst:ko}}|]]\n<big>\'\'\'{{lang|ko|' + hanzi + u'}}\'\'\'</big>\n' + u'{{-pron-}}\n'
            kowppg = wikipedia.Page(siteko, hanzi)
            if kowppg.exists():
                wikitext = wikitext + u'{{wikipedia|lang=ko}}\n'
        if kHangul:
            Hangul = u'* [[Wiktionary:諺文索引|諺文]]:[[' + wikipedia.html2unicode(kHangul) + u']]\n'
            wikitext = wikitext + Hangul
        if kKorean:
            Korean = u'* 耶鲁式:' + wikipedia.html2unicode(kKorean.lower()) + u'\n'
            wikitext = wikitext + Korean

        # ---- Vietnamese section ----
        if kVietnamese:
            wikitext = wikitext + u'{{-vi-}}\n[[Category:{{subst:vi}}|]]\n<big>\'\'\'{{lang|vi|' + hanzi + u'}}\'\'\'</big>\n' + u'{{-pron-}}\n'
            viwppg = wikipedia.Page(sitevi, hanzi)
            if viwppg.exists():
                wikitext = wikitext + u'{{wikipedia|lang=vi}}\n'
            Vietnamese = u'* [[' + wikipedia.html2unicode(kVietnamese) + u']]\n'
            wikitext = wikitext + Vietnamese

        # Interwiki links.
        wikitext = wikitext + u'[[en:' + hanzi + u']]\n' + u'[[ja:' + hanzi + u']]\n'

        # ---- Save ----
        # An existing non-redirect page is never overwritten: the text goes to
        # a holding page under User:Sz-iwbot/Unihan/ to be merged by hand.
        pg = wikipedia.Page(site, hanzi)
        if pg.exists() and not pg.isRedirectPage():
            pg = wikipedia.Page(site, u'User:Sz-iwbot/Unihan/%s' % hanzi)
            pg.put(wikitext, u"[[%s]]已存在,临时存放等待合并" % hanzi)
            wikipedia.output(u'[[%s]]已存在,临时存放[[user:Sz-iwbot/Unihan/%s]]等待合并' % (hanzi, hanzi))
        else:
            pg.put(wikitext, u"增加\"%s\"的unihan数据" % hanzi)
            wikipedia.output(u'增加[[%s]]的unihan数据' % hanzi)

        # Record the NEXT code point to process.  This line is only reached
        # after a successful save, and the startup code resumes at the stored
        # value; storing x itself made every restart redo the character just
        # saved (and file a duplicate on the holding page).
        log = open('unihan.dat', 'w')
        try:
            log.write(str(x + 1))
        finally:
            log.close()

        # Random pause between code points.
        randomsleep = random.randint(1, 100)
        wikipedia.output('sleep %ss' % str(randomsleep))
        time.sleep(randomsleep)
finally:
    # Run the framework shutdown hook even when the loop stops on an error.
    wikipedia.stopme()