Benutzer Diskussion:Klausi/Umlaut-Bot

Aus VoWi
Zur Navigation springen Zur Suche springen

Etwaige Verbesserungsvorschläge zum Umlautbot[Quelltext bearbeiten]

Dies sollte aber die Semantik praktisch nicht verändern.
Besser wär's, wenn man alle Ersetzungsmöglichkeiten durchprobieren würde, statt einfach alle Umlaut-Umschreibungen auf einmal zu ersetzen. Das muss ich mir noch überlegen.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

## Copyright (C) 2007 klausi <klausi[ät]fsinf.at>, panzi <e0427417[ät]student.tuwien.ac.at>
##
## umlaut-bot is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published
## by the Free Software Foundation; version 3 or any later version.

import re
import popen2
import wikipedia # Import the wikipedia module

# Maps every ASCII transliteration the bot looks for to the German
# character it may stand for.  "sz" and "ss" both map to "ß".
UMLS = {
	"ae":u"ä",
	"oe":u"ö",
	"ue":u"ü",
	"Ae":u"Ä",
	"Oe":u"Ö",
	"Ue":u"Ü",
	"sz":u"ß",
	"ss":u"ß"
}

# Matches either a http/ftp URL (consumed as a whole so that nothing inside
# it is ever touched; the 'word' group stays unset) or a complete word that
# contains at least one key of UMLS (available as the 'word' group).
# re.UNICODE makes \w cover non-ASCII letters as well; without it a word
# that already contains an umlaut would be cut at that character and only
# a fragment of it would be handed to the spell checker.
UML_WORD = re.compile(
	r"(?:ht|f)tp://\S+|(?P<word>\w*(?:%s)\w*)" % '|'.join(UMLS),
	re.UNICODE)

class Aspell(object):
	"""Callable spell checker that talks to an ``aspell -a`` subprocess.

	Calling the instance with a word returns True when aspell accepts it
	as correctly spelled German, False otherwise.
	"""
	def __init__(self):
		# --encoding=UTF-8 tells aspell that the bytes written by __call__
		# are UTF-8, instead of leaving the choice of encoding to aspell.
		self._f = popen2.Popen3("aspell -a -l de --encoding=UTF-8")
		self._f.fromchild.readline() # skip the credit line
	
	def __call__(self, word):
		"""Ask aspell about ``word``.

		Returns True only if aspell reported at least one token for the
		line and flagged every token with "*" (found in the dictionary).

		Raises IOError if the aspell pipe reaches end-of-file before the
		answer is complete.
		"""
		self._f.tochild.write((word + u'\n').encode("UTF-8"))
		self._f.tochild.flush()
		
		# aspell answers with one line per token it found in the input and
		# terminates the answer with a blank line.  Read up to that blank
		# line so that inputs yielding zero or several tokens cannot shift
		# the answers of all following queries.
		verdicts = []
		while True:
			line = self._f.fromchild.readline()
			if not line:
				raise IOError("aspell closed its output unexpectedly")
			line = line.strip()
			if not line:
				break
			verdicts.append(line.decode("UTF-8", "replace"))
		
		if not verdicts:
			return False
		for verdict in verdicts:
			if verdict != u"*":
				return False
		return True

def main():
	"""Run the bot over all pages of the default site.

	For every page that is not a redirect, the page text and the page
	title are passed through replace(); a changed text is saved and a
	changed title causes the page to be moved.
	"""
	site = wikipedia.getSite() # Taking the default site
	startpage = '!'
	for page in site.allpages(startpage):
		title = page.title()
		print(title)
		if page.isRedirectPage():
			continue
		# Save the corrected text first, while `page` still names the page
		# that holds the content.  Moving first and then calling put() on
		# the same object would send the edit to the old title.
		# NOTE(review): assumes move() does not retarget `page` and that
		# the wiki leaves a redirect under the old title — confirm.
		text    = page.get()
		newtext = replace(text)
		if newtext is not None:
			page.put(newtext, comment=u'Bot: Umlaute einführen', minorEdit = True)
		newtitle = replace(title)
		if newtitle is not None:
			page.move(newtitle)

def replace(text):
	"""Return ``text`` with transliterated words replaced by umlaut forms.

	A word is replaced only if aspell rejects it as written and accepts
	the form produced by replaceInWord().  Returns None when no word was
	replaced.
	"""
	pieces = []
	cursor = 0
	for hit in UML_WORD.finditer(text):
		candidate = hit.group('word')
		# URLs match the other alternative and leave 'word' unset;
		# words of one or two characters are ignored as well.
		if not candidate or len(candidate) <= 2 or aspell(candidate):
			continue
		substitute = replaceInWord(candidate)
		if not aspell(substitute):
			continue
		# keep the untouched text in front of the word, then the new word
		pieces.append(text[cursor:hit.start()])
		pieces.append(substitute)
		cursor = hit.end()
	
	if not pieces:
		return None
	pieces.append(text[cursor:])
	return ''.join(pieces)

def replaceInWord(word):
	"""Return ``word`` with every key of UMLS replaced by its mapped character.

	All occurrences of each key are replaced, one key after the other,
	in the iteration order of the UMLS dictionary.
	"""
	for digraph in UMLS:
		word = word.replace(digraph, UMLS[digraph])
	return word

# Module-wide spell checker instance called by replace().  Creating it
# starts the aspell subprocess as soon as this module is executed.
aspell = Aspell()

if __name__ == '__main__':
	try:
		main()
	finally:
		# Runs even when main() raises.
		# NOTE(review): stopme() presumably signs the bot off from the
		# pywikipedia framework — confirm against its documentation.
		wikipedia.stopme()

Und hier werden alle möglichen Umlaut-Zuweisungen durchprobiert[Quelltext bearbeiten]

#!/usr/bin/env python
# -*- coding: utf-8 -*-

## Copyright (C) 2007 klausi <klausi[ät]fsinf.at>, panzi <e0427417[ät]student.tuwien.ac.at>
##
## umlaut-bot is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published
## by the Free Software Foundation; version 3 or any later version.

import re
import popen2
import wikipedia # Import the wikipedia module
from itertools import islice

# Maps every ASCII transliteration the bot looks for to the German
# character it may stand for.  "sz" and "ss" both map to "ß".
UMLS = {
	"ae":u"ä",
	"oe":u"ö",
	"ue":u"ü",
	"Ae":u"Ä",
	"Oe":u"Ö",
	"Ue":u"Ü",
	"sz":u"ß",
	"ss":u"ß"
}

# Matches either a http/ftp URL (consumed as a whole so that nothing inside
# it is ever touched; the 'word' group stays unset) or a complete word that
# contains at least one key of UMLS (available as the 'word' group).
# re.UNICODE makes \w cover non-ASCII letters as well; without it a word
# that already contains an umlaut would be cut at that character and only
# a fragment of it would be handed to the spell checker.
UML_WORD = re.compile(
	r"(?:ht|f)tp://\S+|(?P<word>\w*(?:%s)\w*)" % '|'.join(UMLS),
	re.UNICODE)
# Matches a single transliteration (one key of UMLS) inside a word.
UML_CHAR = re.compile('|'.join(UMLS))

class Aspell(object):
	"""Callable spell checker that talks to an ``aspell -a`` subprocess.

	Calling the instance with a word returns True when aspell accepts it
	as correctly spelled German, False otherwise.
	"""
	__slots__ = '_f',
	
	def __init__(self):
		self._f = popen2.Popen3(['aspell', '-a', '-l', 'de', '--encoding=UTF-8', '--dont-suggest'])
		self._f.fromchild.readline() # skip the credit line
	
	def __call__(self, word):
		"""Ask aspell about ``word``.

		Returns True only if aspell reported at least one token for the
		line and flagged every token with "*" (found in the dictionary).

		Raises IOError if the aspell pipe reaches end-of-file before the
		answer is complete.
		"""
		self._f.tochild.write((word + u'\n').encode("UTF-8"))
		self._f.tochild.flush()
		
		# aspell answers with one line per token it found in the input and
		# terminates the answer with a blank line.  Read up to that blank
		# line so that inputs yielding zero or several tokens cannot shift
		# the answers of all following queries.
		verdicts = []
		while True:
			line = self._f.fromchild.readline()
			if not line:
				raise IOError("aspell closed its output unexpectedly")
			line = line.strip()
			if not line:
				break
			verdicts.append(line.decode("UTF-8", "replace"))
		
		if not verdicts:
			return False
		for verdict in verdicts:
			if verdict != u"*":
				return False
		return True

def main():
	"""Run the bot over all pages of the default site.

	For every page that is not a redirect, the page text and the page
	title are passed through replace(); a changed text is saved and a
	changed title causes the page to be moved.
	"""
	site = wikipedia.getSite() # Taking the default site
	startpage = '!'
	for page in site.allpages(startpage):
		title = page.title()
		print(title)
		if page.isRedirectPage():
			continue
		# Save the corrected text first, while `page` still names the page
		# that holds the content.  Moving first and then calling put() on
		# the same object would send the edit to the old title.
		# NOTE(review): assumes move() does not retarget `page` and that
		# the wiki leaves a redirect under the old title — confirm.
		text    = page.get()
		newtext = replace(text)
		if newtext is not None:
			page.put(newtext, comment=u'Bot: Umlaute einführen', minorEdit = True)
		newtitle = replace(title)
		if newtitle is not None:
			page.move(newtitle)

def replace(text):
	"""Return ``text`` with transliterated words replaced by umlaut forms.

	For every word that aspell rejects as written, the variants produced
	by replaceInWord() are tried in order (the first one is skipped) and
	the first variant aspell accepts is substituted.  Returns None when
	no word was replaced.
	"""
	pieces = []
	cursor = 0
	for hit in UML_WORD.finditer(text):
		original = hit.group('word')
		# URLs match the other alternative and leave 'word' unset;
		# words of one or two characters are ignored as well.
		if not original or len(original) <= 2 or aspell(original):
			continue
		for variant in islice(replaceInWord(original), 1, None):
			if not aspell(variant):
				continue
			# keep the untouched text in front of the word, then the new word
			pieces.append(text[cursor:hit.start()])
			pieces.append(variant)
			cursor = hit.end()
			break
	
	if not pieces:
		return None
	pieces.append(text[cursor:])
	return ''.join(pieces)

def replaceInWord(word):
	"""Generate the spelling variants of ``word``.

	The word is cut at every match of UML_CHAR; assignUmls() then
	produces every combination of keeping or replacing each of those
	matches.  A word without any match is yielded unchanged as the only
	result.
	"""
	digraphs = UML_CHAR.findall(word)
	if not digraphs:
		yield word
		return
	
	# text between the matches: one more segment than there are digraphs
	segments = UML_CHAR.split(word)
	head = segments[0]
	for tail in assignUmls(segments[1:], digraphs):
		yield head + tail

def assignUmls(parts,umls):
	"""Generate every keep/replace combination for the given digraphs.

	``umls[i]`` is the digraph that precedes the plain text ``parts[i]``.
	Each result is the concatenation of all digraphs (kept as written or
	swapped for their UMLS character) with the text following them.  For
	every digraph the kept form is produced before the replaced form, the
	first digraph changing slowest.
	"""
	digraph = umls[0]
	trailing = parts[0]
	laterParts = parts[1:]
	
	for swap in (False, True):
		if swap:
			lead = UMLS[digraph] + trailing
		else:
			lead = digraph + trailing
		
		if laterParts:
			for tail in assignUmls(laterParts, umls[1:]):
				yield lead + tail
		else:
			yield lead

# Module-wide spell checker instance called by replace().  Creating it
# starts the aspell subprocess as soon as this module is executed.
aspell = Aspell()

if __name__ == '__main__':
	try:
		main()
	finally:
		# Runs even when main() raises.
		# NOTE(review): stopme() presumably signs the bot off from the
		# pywikipedia framework — confirm against its documentation.
		wikipedia.stopme()


Falsche Ersätzung[Quelltext bearbeiten]

aufzubauen -> aufzubaün

Liegt das ggf. an aspell? 132.183.139.80 22:36, 3. Feb. 2011 (CET)