Benutzer Diskussion:Klausi/Umlaut-Bot
Zur Navigation springen
Zur Suche springen
Etwaige Verbesserungsvorschläge zum Umlautbot[Quelltext bearbeiten]
Dies sollte aber die Semantik praktisch nicht verändern.
Besser wär's, wenn man alle Ersetzungsmöglichkeiten durchprobieren würde, anstatt einfach alle Umlaute zu ersetzen. Das muss ich mir noch überlegen.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
## Copyright (C) 2007 klausi <klausi[ät]fsinf.at>, panzi <e0427417[ät]student.tuwien.ac.at>
##
## umlaut-bot is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published
## by the Free Software Foundation; version 3 or any later version.
import re
import popen2
import wikipedia # Import the wikipedia module
# Maps each ASCII transcription (digraph) to the German character it may stand
# for.  Both "sz" and "ss" are candidates for "ß".
UMLS = {
    "ae": u"ä",
    "oe": u"ö",
    "ue": u"ü",
    "Ae": u"Ä",
    "Oe": u"Ö",
    "Ue": u"Ü",
    "sz": u"ß",
    "ss": u"ß",
}

# Matches either a URL or a word that contains at least one of the digraphs
# above.  A URL is consumed as a whole and leaves the ``word`` group unset, so
# callers that only look at ``word`` never rewrite links.
#   * ``https?`` covers https:// links as well as http:// ones; with the former
#     ``(?:ht|f)tp`` an https URL fell through to the ``word`` alternative.
#   * re.UNICODE lets ``\w`` match non-ASCII letters, so a word that already
#     contains an umlaut or ß is matched as one word instead of being cut at
#     that character.
UML_WORD = re.compile(
    r"(?:https?|ftp)://\S+|(?P<word>\w*(?:%s)\w*)" % '|'.join(UMLS),
    re.UNICODE)
class Aspell(object):
    """Callable spell checker backed by an ``aspell -a`` child process.

    An instance keeps one aspell process (German dictionary) running and
    talks to it line by line over its stdin/stdout pipes.
    """

    def __init__(self):
        # __call__ sends UTF-8 encoded words, so aspell is told explicitly to
        # use that encoding instead of whatever the locale implies.
        self._f = popen2.Popen3("aspell -a -l de --encoding=UTF-8")
        self._f.fromchild.readline() # skip the credit line

    def __call__(self, word):
        """Return True if aspell accepts *word*, False otherwise.

        One line containing *word* is written to aspell.  The reply is read
        up to the blank line that terminates it (or end of file).  The word
        counts as correct only if there is at least one result line and
        every result line is ``*``.

        Reading up to the terminator, rather than assuming exactly one
        result line, keeps the pipe in sync when aspell reports no result
        (nothing checkable in the line) or several results (the line was
        split into more than one token).
        """
        self._f.tochild.write((word + u'\n').encode("UTF-8"))
        self._f.tochild.flush()
        verdicts = []
        while True:
            line = self._f.fromchild.readline().decode("UTF-8", "replace").strip()
            if not line:
                # blank terminator line, or the child closed its output
                break
            verdicts.append(line)
        return bool(verdicts) and all(v == u"*" for v in verdicts)
def main():
    """Walk all pages of the default site and introduce umlauts.

    For every non-redirect page the title and the page text are passed
    through replace().  A changed title leads to a page move, a changed text
    is saved as a minor edit.
    """
    site = wikipedia.getSite() # Taking the default site
    startpage = '!'
    for page in site.allpages(startpage):
        title = page.title()
        print(title)
        if page.isRedirectPage():
            continue
        # Read the text before moving, while ``page`` still names the
        # location where the content lives.
        text = page.get()
        newtitle = replace(title)
        if newtitle is not None and page.move(newtitle):
            # NOTE(review): assumes Page.move() returns a true value on
            # success and does not retarget this Page object -- confirm
            # against the pywikipedia version in use.  Under that assumption
            # the edit below must go to the new title; saving through the old
            # object would write the text onto the old title.
            page = wikipedia.Page(site, newtitle)
        newtext = replace(text)
        if newtext is not None:
            page.put(newtext, comment=u'Bot: Umlaute einführen', minorEdit=True)
def replace(text):
    """Return *text* with misspelled digraph words replaced, or None.

    Every word found by UML_WORD that is longer than two characters and is
    rejected by aspell is converted with replaceInWord(); the converted word
    is used only if aspell accepts it.  URL matches carry no ``word`` group
    and are skipped.  None is returned when nothing was replaced.
    """
    pieces = []
    cursor = 0  # end of the last replaced word within text
    for hit in UML_WORD.finditer(text):
        candidate = hit.group('word')
        if not candidate or len(candidate) <= 2 or aspell(candidate):
            continue
        converted = replaceInWord(candidate)
        if not aspell(converted):
            continue
        pieces.append(text[cursor:hit.start()])
        pieces.append(converted)
        cursor = hit.end()
    if not pieces:
        return None
    pieces.append(text[cursor:])
    return ''.join(pieces)
def replaceInWord(word):
    """Return *word* with every digraph listed in UMLS replaced.

    The digraphs are substituted one after another in the iteration order of
    the UMLS dict; all occurrences of each digraph are replaced.
    """
    result = word
    for digraph in UMLS:
        result = result.replace(digraph, UMLS[digraph])
    return result
# Module-wide spell checker instance used by replace().  Creating it starts
# the aspell child process as soon as this module is loaded.
aspell = Aspell()

if __name__ == '__main__':
    try:
        main()
    finally:
        # Runs whether main() finished normally or raised.
        wikipedia.stopme()
Und hier werden alle möglichen Umlaut-Zuweisungen durchprobiert[Quelltext bearbeiten]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
## Copyright (C) 2007 klausi <klausi[ät]fsinf.at>, panzi <e0427417[ät]student.tuwien.ac.at>
##
## umlaut-bot is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published
## by the Free Software Foundation; version 3 or any later version.
import re
import popen2
import wikipedia # Import the wikipedia module
from itertools import islice
# Maps each ASCII transcription (digraph) to the German character it may stand
# for.  Both "sz" and "ss" are candidates for "ß".
UMLS = {
    "ae": u"ä",
    "oe": u"ö",
    "ue": u"ü",
    "Ae": u"Ä",
    "Oe": u"Ö",
    "Ue": u"Ü",
    "sz": u"ß",
    "ss": u"ß",
}

# Matches either a URL or a word that contains at least one of the digraphs
# above.  A URL is consumed as a whole and leaves the ``word`` group unset, so
# callers that only look at ``word`` never rewrite links.
#   * ``https?`` covers https:// links as well as http:// ones; with the former
#     ``(?:ht|f)tp`` an https URL fell through to the ``word`` alternative.
#   * re.UNICODE lets ``\w`` match non-ASCII letters, so a word that already
#     contains an umlaut or ß is matched as one word instead of being cut at
#     that character.
UML_WORD = re.compile(
    r"(?:https?|ftp)://\S+|(?P<word>\w*(?:%s)\w*)" % '|'.join(UMLS),
    re.UNICODE)

# Matches a single digraph; used to split a word at its digraphs.
UML_CHAR = re.compile('|'.join(UMLS))
class Aspell(object):
    """Callable spell checker backed by an ``aspell -a`` child process.

    An instance keeps one aspell process (German dictionary, UTF-8 encoding,
    suggestions disabled) running and talks to it line by line over its
    stdin/stdout pipes.
    """

    __slots__ = ('_f',)

    def __init__(self):
        self._f = popen2.Popen3(['aspell', '-a', '-l', 'de', '--encoding=UTF-8', '--dont-suggest'])
        self._f.fromchild.readline() # skip the credit line

    def __call__(self, word):
        """Return True if aspell accepts *word*, False otherwise.

        One line containing *word* is written to aspell.  The reply is read
        up to the blank line that terminates it (or end of file).  The word
        counts as correct only if there is at least one result line and
        every result line is ``*``.

        Reading up to the terminator, rather than assuming exactly one
        result line, keeps the pipe in sync when aspell reports no result
        (nothing checkable in the line) or several results (the line was
        split into more than one token).
        """
        self._f.tochild.write((word + u'\n').encode("UTF-8"))
        self._f.tochild.flush()
        verdicts = []
        while True:
            line = self._f.fromchild.readline().decode("UTF-8", "replace").strip()
            if not line:
                # blank terminator line, or the child closed its output
                break
            verdicts.append(line)
        return bool(verdicts) and all(v == u"*" for v in verdicts)
def main():
    """Walk all pages of the default site and introduce umlauts.

    For every non-redirect page the title and the page text are passed
    through replace().  A changed title leads to a page move, a changed text
    is saved as a minor edit.
    """
    site = wikipedia.getSite() # Taking the default site
    startpage = '!'
    for page in site.allpages(startpage):
        title = page.title()
        print(title)
        if page.isRedirectPage():
            continue
        # Read the text before moving, while ``page`` still names the
        # location where the content lives.
        text = page.get()
        newtitle = replace(title)
        if newtitle is not None and page.move(newtitle):
            # NOTE(review): assumes Page.move() returns a true value on
            # success and does not retarget this Page object -- confirm
            # against the pywikipedia version in use.  Under that assumption
            # the edit below must go to the new title; saving through the old
            # object would write the text onto the old title.
            page = wikipedia.Page(site, newtitle)
        newtext = replace(text)
        if newtext is not None:
            page.put(newtext, comment=u'Bot: Umlaute einführen', minorEdit=True)
def replace(text):
    """Return *text* with misspelled digraph words replaced, or None.

    Every word found by UML_WORD that is longer than two characters and is
    rejected by aspell is expanded with replaceInWord().  The first candidate
    is skipped; of the remaining ones, the first that aspell accepts replaces
    the word.  URL matches carry no ``word`` group and are skipped.  None is
    returned when nothing was replaced.
    """
    pieces = []
    cursor = 0  # end of the last replaced word within text
    for hit in UML_WORD.finditer(text):
        candidate = hit.group('word')
        if not candidate or len(candidate) <= 2 or aspell(candidate):
            continue
        for variant in islice(replaceInWord(candidate), 1, None):
            if aspell(variant):
                pieces.append(text[cursor:hit.start()])
                pieces.append(variant)
                cursor = hit.end()
                break
    if not pieces:
        return None
    pieces.append(text[cursor:])
    return ''.join(pieces)
def replaceInWord(word):
    """Yield every variant of *word* obtained by keeping or replacing digraphs.

    The word is cut at each UML_CHAR match; assignUmls() then produces all
    combinations for the pieces after the first one.  A word without any
    digraph is yielded unchanged as the only result.
    """
    digraphs = UML_CHAR.findall(word)
    if not digraphs:
        yield word
        return
    # split() returns the text around the digraphs: one more item than there
    # are digraphs, the first being the text before the first digraph.
    segments = UML_CHAR.split(word)
    head = segments[0]
    for tail in assignUmls(segments[1:], digraphs):
        yield head + tail
def assignUmls(parts, umls):
    """Yield all keep/replace combinations for the given digraphs.

    *umls* holds the digraphs in order and *parts* the text that follows each
    of them.  For the first digraph, all results that keep it are yielded
    before all results that use its UMLS replacement; the same order applies
    recursively to the remaining digraphs.
    """
    segment = parts[0]
    digraph = umls[0]
    later_parts = parts[1:]
    later_umls = umls[1:]

    def tails():
        # Combinations for everything after the first digraph; a single
        # empty tail once no digraph is left.
        if later_parts:
            return assignUmls(later_parts, later_umls)
        return ('',)

    kept = digraph + segment
    for tail in tails():
        yield kept + tail
    substituted = UMLS[digraph] + segment
    for tail in tails():
        yield substituted + tail
# Module-wide spell checker instance used by replace().  Creating it starts
# the aspell child process as soon as this module is loaded.
aspell = Aspell()

if __name__ == '__main__':
    try:
        main()
    finally:
        # Runs whether main() finished normally or raised.
        wikipedia.stopme()
Falsche Ersätzung[Quelltext bearbeiten]
aufzubauen -> aufzubaün
Liegt das ggf. an aspell? 132.183.139.80 22:36, 3. Feb. 2011 (CET)