1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/python
"""
Script to generate URS from the starting paragraph of Wikipedia
articles about persons.
by Pravin Paratey (pravinp -at- gmail.com)
Current Implementation:
----------------------
1. Extract first sentence
2. Clean wiki markup
3. Observing the given data and the data on Wikipedia shows that there
is a pattern that is followed when writing Wikipedia entries for
persons. Replacing (was/is)(an/a/the/) with (/the) does the trick.
4. Output sentence formed
Ideally:
--------
Ideally, the piece of code should identify the following concepts:
1. Name of person
2. Time period
3. Son/Daughter/Father/Mother of (in case of famous personality)
4. Renowned for
How do we go about it?
1 and 2 - straightforward. Wikipedia gives cues through its markup.
3 - straightforward. String matching using "son of", "daughter of", etc.
4 - will need to match against a database.
For 3, we only keep the "son of", "daughter of", "X of Y" if Y is a prominent
person. An easy way of doing this is to use incoming links on Wikipedia OR
to search for X and Y individually on Google and note the number of results.
"""
import re, sys, codecs
def cleanUri(m):
    """ Reduce a matched wiki link to its visible text.

    m is a regex match whose group 1 holds the link contents. When the
    contents contain a '|', the segment right after the first '|' is
    used; otherwise the whole contents are used. The result is returned
    with surrounding whitespace stripped. """
    segments = m.group(1).split('|')
    # A link without '|' yields a single segment: the contents themselves.
    label = segments[1] if len(segments) > 1 else segments[0]
    return label.strip()
def dotRemove(m):
    """ Return the whole matched text with every '.' turned into '#'.

    The replacement is one character for one character, so the length
    of the matched text is unchanged. """
    matched = m.group(0)
    return '#'.join(matched.split('.'))
def cleanMarkup(text):
    """ Strip wiki markup from text and return the cleaned string.

    Steps, in order:
    1. [[...]] links are reduced to their visible text via cleanUri
    2. {{...}} templates, <ref>...</ref> references, <!-- --> comments
       and single-bracket [...] spans are removed
    3. bold markers (''') are dropped and italic markers ('') become
       a single quote
    4. any leftover [[ or ]] is dropped
    5. the en dash (literal or as &ndash;) becomes '-', and &amp;
       becomes '&' """
    text = re.sub(r"\[\[(.*?)\]\]", cleanUri, text)
    text = re.sub(r"\{\{.*?\}\}", r"", text)
    text = re.sub(r"<ref>.*?<\/ref>", r"", text)
    text = re.sub(r"<!--.*?-->", r"", text)
    # Runs after the [[...]] pass, so only single-bracket spans remain.
    text = re.sub(r"\[.*?\]", r"", text)
    text = text.replace("'''", "").replace("''", "'")
    text = text.replace("[[", "").replace("]]", "")
    text = text.replace("&ndash;", "-").replace("–", "-")
    # Decode &amp; last so that "&amp;ndash;" is not decoded twice.
    text = text.replace("&amp;", "&")
    return text
def getFirstSentence(text):
    """ Return text up to (not including) the first sentence-ending '.'

    Dots inside wiki links, bracketed spans, <ref> tags, comments and
    bold/italic runs are masked first so they are not taken for the end
    of the sentence. If no other '.' exists, text is returned whole. """
    protected_spans = (
        r"\[\[.*?\]\]",
        r"\[.*?\]",
        r"<ref>.*?<\/ref>",
        r"<!--.*?-->",
        r"'''.*?'''",
        r"''.*?''",
    )
    masked = text
    for pattern in protected_spans:
        masked = re.sub(pattern, dotRemove, masked)
    # Masking swaps '.' for '#' one-for-one, so an offset found in the
    # masked copy is also valid in the original text.
    cut = masked.find('.')
    return text if cut == -1 else text[:cut]
def makeArticle(m):
    """ Pick the replacement for a matched "was/is [article]" phrase.

    Returns ', the' when group 2 of the match (the article) is
    non-empty, and a single space when it is empty. """
    has_no_article = len(m.group(2)) == 0
    return ' ' if has_no_article else ', the'
def extractURS(text):
    """ The function to call. Returns the URS

    The first sentence of text is extracted and stripped of wiki markup;
    then each "was/is" (with an optional leading comma and an optional
    following article an/the/a) is rewritten by makeArticle. """
    text = getFirstSentence(text)
    text = cleanMarkup(text)
    # The article must end at a word boundary; otherwise the leading
    # letters of an ordinary word are consumed as an article
    # ("is american" -> ", themerican"). The empty alternative is kept so
    # group 2 is always a string for makeArticle.
    text = re.sub(r",?\s+(was|is)\s+((?:an|the|a)\b|)", makeArticle, text)
    return text
if __name__ == '__main__':
    # Reads input.txt line by line and writes one URS per line to
    # output.txt, both as UTF-8. The `with` blocks close both files even
    # if extractURS raises part-way through.
    #fp = open(sys.argv[1])
    with codecs.open("input.txt", "r", "utf-8") as fp:
        with codecs.open("output.txt", "w", "utf-8") as fp2:
            fp2.write(codecs.BOM_UTF8.decode("utf-8"))  # Add BOM for UTF-8
            for line in fp:
                line = line.rstrip()
                # Skip blank lines and '#'-prefixed lines (for debugging)
                if not line or line.startswith("#"):
                    continue
                urs = extractURS(line)
                fp2.write(urs + '\r\n')