summaryrefslogtreecommitdiffstats
path: root/tools2/translate_index.py
blob: 43a1eec845e43a4a15b09c4d3daa3c248856874f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
import sys
from urllib import urlopen
from urllib import FancyURLopener
import codecs
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
# NOTE: this script uses BeautifulSoup 3, which is deprecated and whose
# API is incompatible with the latest version.
# Docs are here:
# http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
import shutil
import json

# read the list of articles


def normalize_title(title):
    """Normalize an article title into Wikipedia URL form.

    Strips surrounding whitespace (including the trailing newline left by
    readline()), replaces spaces with underscores, and upper-cases only the
    first character, leaving the rest of the title untouched.

    Returns an empty string for blank input instead of raising IndexError,
    so blank lines in the favorites file no longer crash the script.
    """
    s = title.strip().replace(' ', '_')
    if not s:
        # the original indexed s[0] unconditionally and blew up here
        return s
    return s[0].capitalize() + s[1:]


class FileListReader():
    """Read a UTF-8 text file with one article title per line.

    The normalized titles (see normalize_title) are exposed as
    ``self.list`` in file order.
    """

    def __init__(self, file_name):
        # The original never closed the file handle; the context manager
        # guarantees it is released even if a decode error is raised.
        with codecs.open(file_name, encoding='utf-8', mode='r') as _file:
            # Iterating the file yields the same lines (newlines included)
            # as the original readline() loop.
            self.list = [normalize_title(line) for line in _file]


class CustomUrlOpener(FancyURLopener):
    """URL opener presenting a desktop-Firefox user agent.

    FancyURLopener sends its ``version`` attribute as the User-Agent
    header; overriding it here makes the requests look like a regular
    browser (presumably because Wikipedia is less friendly to the
    default Python-urllib agent — not verified here).
    """

    version = ('Mozilla/5.0 (X11; Linux x86_64; rv:9.0) Gecko/20100101 '
               'Firefox/9.0')

# --- command-line handling -------------------------------------------------
# The single required argument is the target language code; it is later
# interpolated into the 'interwiki-<lang>' CSS class and into the output
# file names, so it must match Wikipedia's interwiki language codes.
if len(sys.argv) < 2:
    print "Use as ../translate_index.py lang"
    exit()
else:
    translate_to = sys.argv[1]
    print "Translating to", translate_to

# English favorite article titles, one per line (normalized on load).
favorites_reader = FileListReader('../en/favorites_en.txt')

# Output file that will receive the translated favorite titles.
new_favorites_name = './favorites_%s.txt' % translate_to

# Maps English article title -> translated title.
trans_dict = {}
dictionary_file_name = 'en_to_%s_dictionary.json' % translate_to

# Only scrape Wikipedia when the translated favorites file does not already
# exist; the results are also cached as JSON (below) so later runs can skip
# the slow per-article downloads entirely.
if not os.path.exists(new_favorites_name):
    new_favorites = codecs.open(new_favorites_name,
                                    encoding='utf-8', mode='w')

    for article in favorites_reader.list:
        # Fetch the English article page; the sidebar interwiki links on it
        # point to the translations.
        url_article = "http://en.wikipedia.org/wiki/%s" % article
        print article,
        opener = CustomUrlOpener()
        html = opener.open(url_article)
        #print html
        soup = BeautifulSoup(html)
        # The <li class="interwiki-<lang>"> element wraps the link to the
        # article in the target language.
        li = soup.findAll('li', {'class': 'interwiki-%s' % translate_to})
        if li:
            # BeautifulSoup 3 exposes tag attributes as (name, value)
            # tuples; the anchor's 'title' attribute holds the translated
            # article title.
            for attr in li[0].contents[0].attrs:
                if attr[0] == 'title':
                    translation = attr[1]
                    trans_dict[article] = translation
                    print '=', translation.encode('utf8')
                    new_favorites.write('%s\n' % translation)
        else:
            # No interwiki link: article has no translation; just end the
            # "print article," line started above.
            print
    new_favorites.close()

    # save the translation dictionary
    # the dictionary is saved to be able to process the index.html
    # at a later stage without need do the slow download from wikipedia
    # every time
    print "saving translations as dictionary"
    dictionary_file = codecs.open(dictionary_file_name,
                                    encoding='utf-8', mode='w')
    json.dump(trans_dict, dictionary_file)
    dictionary_file.close()

print "Opening index_en.html"

if not trans_dict:
    # read the translation dictionary
    try:
        dictionary_file = codecs.open(dictionary_file_name,
                                        encoding='utf-8', mode='r')
        trans_dict = json.load(dictionary_file)
        dictionary_file.close()
    except:
        print "Can't read the dictionary file", dictionary_file_name
        exit()

# Raw HTML of the English index page to be translated.
html_index_data = open('../static/index_en.html').read()
#shutil.copyfile(, './index_%s.html' % translate_to)

index_soup = BeautifulSoup(html_index_data)
# Rewrite every anchor whose article has a translation; flag the rest.
for link in index_soup.findAll('a'):
    # hrefs look like '/wiki/<title>' — [6:] strips the '/wiki/' prefix
    # before normalizing, so the result matches trans_dict keys.
    article_link = unicode(normalize_title(link['href'][6:]))

    if article_link in trans_dict:
        #try:
        # Point the link at the translated article and update both the
        # tooltip and the visible link text.
        link['href'] = '/wiki/%s' % trans_dict[article_link]
        link['title'] = trans_dict[article_link]
        link.find(text=link.text).replaceWith(trans_dict[article_link])
        #except:
        #    print "translation not found"
        #    pass
    else:
        # No translation available: tag the anchor so it is highlighted
        # by the stylesheet injected below.
        link['class'] = 'TRANS_PENDING'

# Inject a <style> element into <head> that paints untranslated links red,
# making pending work easy to spot in a browser.
style = Tag(index_soup, "style")
index_soup.html.head.insert(1, style)
style_text = NavigableString('.TRANS_PENDING {background-color:red;}')
style.insert(0, style_text)

# Serialize the modified tree to the per-language output file.
translated_index = open('./index_%s.html' % translate_to, 'w')
translated_index.write(str(index_soup))
translated_index.close()