# Utente:BotSottile/user-fixes.py
# -*- coding: utf-8 -*-
#
# Il codice può essere inserito in user-fixes.py (createlo se non esiste).
#
# Comando di esempio:
#
# replace.py -namespace:0 -xml:itwiki-20080418-pages-meta-current.xml -fix:errori_comuni
 
fixes['errori_comuni'] = {
       'regex': True,
       'recursive': True,
       'msg': {
              'it':u'Bot: Correzione di uno o più [[Utente:IagaBot/Errori_comuni|errori comuni]]'
             },
       'replacements': [

(u'(\\w)(  |   )(\\w)', ur'\1 \3'),
(u'([a-z]|[ìèéùòà\)]) ?\\.([A-Z]|È)', ur'\1. \2'),
(u'([a-z]|[ìèéùòà\)]) ?,([a-z]|")', ur'\1, \2'),
(u'([a-z]|[ìèéùòà\)]) ?(:|;)(\\w|")', ur'\1\2 \3'),
(u'(\\w|[ìèéùòà\)]) (,|\\.|:|;)', ur'\1\2'),
(u'(\\b|\\.)E\' ', ur'\1È '),
(u'\\b([Uu])n\'amico\\b', ur'\1n amico'),
(u'\\b(\\w+)zzion(\\w+)\\b', ur'\1zion\2'),
(u'\\b([aA])(bben|ccioc)chè\\b', ur'\1\2ché'),
(u'\\b([aA])(ffin|ncor|nzi|tteso)chè\\b', ur'\1\2ché'),
(u'\\b([aA])ccellera(re|zione)\\b', ur'\1ccelera\2'),
(u'\\b([aA])e?reoport(o|i)\\b', ur'\1eroport\2'),
(u'\\b([aA])ggiottaggio\\b', ur'\1ggiotaggio'),
(u'\\b([aA])l(cun|lor|tro)chè\\b', ur'\1l\2ché'),
(u'\\b([aA])ltretanto\\b', ur'\1ltrettanto'),
(u'\\b([aA])ppropiat(a|e|i|o)\\b', ur'\1ppropriat\2'),
(u'\\b([aA])pprovigionamento\\b', ur'\1pprovvigionamento'),
(u'\\b([aA])quistare\\b', ur'\1cquistare'),
(u'\\b([aA])vve(gna|gnadio|nga|ngadio)chè\\b', ur'\1vve\2ché'),
(u'\\b([bB])enchè\\b', ur'\1enché'),
(u'\\b([cC])(hec|ioc|omec|onciofosse|ontutto|osic|otal)chè\\b', ur'\1\2ché'),
(u'\\b([cC])osidett(o|i|e|a)\\b', ur'\1osiddett\2'),
(u'\\b([cC])ospiqu(o|i|e|a)\\b', ur'\1ospicu\2'),
(u'\\b([cC]om|)([pP])ropi(o|età|etari|etari[aeo])\\b', ur'\1\2ropri\3'),
(u'\\b([dD])(ac|appoi|imodo|opo|opodi)chè\\b', ur'\1\2ché'),
(u'\\b([dD]e|[cC]oef)ficen(za|te|ti)\\b', ur'\1ficien\2'),
(u'\\b([eE])ssendochè\\b', ur'\1ssendoché'),
(u'\\b([eE])vaqua(re|[t][oiae]|zione)\\b', ur'\1vacua\2'),
(u'\\b([fF])(inattanto|intanto|inac|inattanto|in|uor)chè\\b', ur'\1\2ché'),
(u'\\b([gG])(iac|ran|iafosse|iafossecosa)chè\\b', ur'\1\2ché'),
(u'\\b([iI])(nfinattanto|nquanto)chè\\b', ur'\1\2ché'),
(u'\\b([iI])gen(e|ic[oai]|iche)\\b', ur'\1gien\2'),
(u'\\b([iI])ngenier(e|i)\\b', ur'\1ngegner\2'),
(u'\\b([iI])nnoqu(i|o|a|e)\\b', ur'\1nnocu\2'),
(u'\\b([iI])nzio\\b', ur'\1nizio'),
(u'\\b([iI]l|)([lL])eggittim(o|i|e|a)\\b', ur'\1\2egittim\3'),
(u'\\b([iI]n|)([cC])oscen(za|te|ti)\\b', ur'\1\2oscien\3'),
(u'\\b([iI]n|)([sS])ufficen(za|te|ti)\\b', ur'\1\2ufficien\3'),
(u'\\b([lL])orchè\\b', ur'\1orché'),
(u'\\b([mM])acchè\\b', ur'\1acché'),
(u'\\b([mM])etereologi(a|co|ci|che)\\b', ur'\1eteorologi\2'),
(u'\\b([nN])on(so|)chè\\b', ur'\1on\2ché'),
(u'\\b([oO])(ltre|nde)chè\\b', ur'\1\2ché'),
(u'\\b([oO])nniscen(za|te|ti)\\b', ur'\1nniscien\2'),
(u'\\b([oO])noreficenza\\b', ur'\1norificenza'),
(u'\\b([oO])vverossia\\b', ur'\1vverosia'),
(u'\\b([pP])(oi|oscia|resso|ur)chè\\b', ur'\1\2ché'),
(u'\\b([pP])aralello\\b', ur'\1arallelo'),
(u'\\b([pP])er(cioc|lo|oc|)chè\\b', ur'\1er\2ché'),
(u'\\b([pP])iú\\b', ur'\1iù'),
(u'\\b([pP])rospicent(e|i)\\b', ur'\1rospicient\2'),
(u'\\b([pP])roveniendo\\b', ur'\1rovenendo'),
(u'\\b([pP]rofi|[pP]romis)qu(o|a|e|i)\\b', ur'\1cu\2'),
(u'\\b([qQ])ua(nto|si)chè\\b', ur'\1ua\2ché'),
(u'\\b([rR]i|)([cC])onoscien(za|te|ti)\\b', ur'\1\2onoscen\3'),
(u'\\b([sS])(econdo|ennon|enon|tante)chè\\b', ur'\1\2ché'),
(u'\\b([sS])cenz([ae])\\b', ur'\1cienz\2'),
(u'\\b([sS])enonch[èé]\b', ur'\1ennonch\2'),
(u'\\b([sS])i(c|nattanto|n|ntanto)chè\\b', ur'\1i\2ché'),
(u'\\b([sS])oprattuto\\b', ur'\1oprattutto'),
(u'\\b([sS])triscie\\b', ur'\1trisce'),
(u'\\b([sS])uperfice\\b', ur'\1uperficie'),
(u'\\b([tT])(al|almente|anto|ranne|utto)chè\\b', ur'\1\2ché'),
(u'\\b([tT])errittorio\\b', ur'\1erritorio'),
(u'<<', ur'«'),
(u'>>', ur'»'),
(u'[Kk]m[q2]', ur'km²')

       ],
        'exceptions': {
                    'inside-tags': [
                                'hyperlink',    
                                'link',
                                'comment',
                                'timeline',
                                'gallery',
                                'math',
                                'pre',
                                'startspace',
                                'source', 
                                'nowiki'
                        ] ,
                   'inside': [
                                r'(?s)<[^>]+>',
                                r'(?s)\{[^\}]+\}',
                                r'&[^;]+;',
                                r'\[([\w\W])+\]',
                                r'(a|d).C.',
                                r'\"([\w\W])+\"',
                                r'(?i)(s\.n\.c|s\.r\.l|s\.a\.s|s\.p\.a)',
                                r'[Mm](r|iss|ister)\.\w',
                                r'\.NET',
                                r'[Ss]t\.\w'
                       ],
                    'text-contains': [
                                'IagaBot'
                        ]
        }
   }

fixes['tutti'] = {
       'regex': True,
       'recursive': True,
       'msg': {
              'it':u'Bot: Correzione di uno o più [[Utente:IagaBot/Errori_comuni|errori comuni]]'
             },
       'replacements': [
 
(u'([a-z]|[ìèéùòà\)]) ?\\.([A-Z]|È)', ur'\1. \2'),
(u'([a-z]|[ìèéùòà\)]) ?,([a-z]|")', ur'\1, \2'),
(u'([a-z]|[ìèéùòà\)]) ?(:|;)(\\w|")', ur'\1\2 \3'),
(u'(\\w|[ìèéùòà\)]) (,|\\.|:|;)', ur'\1\2'),
(u'(\\b|\\.)E\' ', ur'\1È '),
(u'\\b([Uu])n\'amico\\b', ur'\1n amico'),
(u'\\b(\\w+)zzion(\\w+)\\b', ur'\1zion\2'),
(u'\\b([aA])(bben|ccioc)chè\\b', ur'\1\2ché'),
(u'\\b([aA])(ffin|ncor|nzi|tteso)chè\\b', ur'\1\2ché'),
(u'\\b([aA])ccellera(re|zione)\\b', ur'\1ccelera\2'),
(u'\\b([aA])e?reoport(o|i)\\b', ur'\1eroport\2'),
(u'\\b([aA])ggiottaggio\\b', ur'\1ggiotaggio'),
(u'\\b([aA])l(cun|lor|tro)chè\\b', ur'\1l\2ché'),
(u'\\b([aA])ltretanto\\b', ur'\1ltrettanto'),
(u'\\b([aA])ppropiat(a|e|i|o)\\b', ur'\1ppropriat\2'),
(u'\\b([aA])pprovigionamento\\b', ur'\1pprovvigionamento'),
(u'\\b([aA])quistare\\b', ur'\1cquistare'),
(u'\\b([aA])vve(gna|gnadio|nga|ngadio)chè\\b', ur'\1vve\2ché'),
(u'\\b([bB])enchè\\b', ur'\1enché'),
(u'\\b([cC])(hec|ioc|omec|onciofosse|ontutto|osic|otal)chè\\b', ur'\1\2ché'),
(u'\\b([cC])osidett(o|i|e|a)\\b', ur'\1osiddett\2'),
(u'\\b([cC])ospiqu(o|i|e|a)\\b', ur'\1ospicu\2'),
(u'\\b([cC]om|)([pP])ropi(o|età|etari|etari[aeo])\\b', ur'\1\2ropri\3'),
(u'\\b([dD])(ac|appoi|imodo|opo|opodi)chè\\b', ur'\1\2ché'),
(u'\\b([dD]e|[cC]oef)ficen(za|te|ti)\\b', ur'\1ficien\2'),
(u'\\b([eE])ssendochè\\b', ur'\1ssendoché'),
(u'\\b([eE])vaqua(re|[t][oiae]|zione)\\b', ur'\1vacua\2'),
(u'\\b([fF])(inattanto|intanto|inac|inattanto|in|uor)chè\\b', ur'\1\2ché'),
(u'\\b([gG])(iac|ran|iafosse|iafossecosa)chè\\b', ur'\1\2ché'),
(u'\\b([iI])(nfinattanto|nquanto)chè\\b', ur'\1\2ché'),
(u'\\b([iI])gen(e|ic[oai]|iche)\\b', ur'\1gien\2'),
(u'\\b([iI])ngenier(e|i)\\b', ur'\1ngegner\2'),
(u'\\b([iI])nnoqu(i|o|a|e)\\b', ur'\1nnocu\2'),
(u'\\b([iI])nzio\\b', ur'\1nizio'),
(u'\\b([iI]l|)([lL])eggittim(o|i|e|a)\\b', ur'\1\2egittim\3'),
(u'\\b([iI]n|)([cC])oscen(za|te|ti)\\b', ur'\1\2oscien\3'),
(u'\\b([iI]n|)([sS])ufficen(za|te|ti)\\b', ur'\1\2ufficien\3'),
(u'\\b([lL])orchè\\b', ur'\1orché'),
(u'\\b([mM])acchè\\b', ur'\1acché'),
(u'\\b([mM])etereologi(a|co|ci|che)\\b', ur'\1eteorologi\2'),
(u'\\b([nN])on(so|)chè\\b', ur'\1on\2ché'),
(u'\\b([oO])(ltre|nde)chè\\b', ur'\1\2ché'),
(u'\\b([oO])nniscen(za|te|ti)\\b', ur'\1nniscien\2'),
(u'\\b([oO])noreficenza\\b', ur'\1norificenza'),
(u'\\b([oO])vverossia\\b', ur'\1vverosia'),
(u'\\b([pP])(oi|oscia|resso|ur)chè\\b', ur'\1\2ché'),
(u'\\b([pP])aralello\\b', ur'\1arallelo'),
(u'\\b([pP])er(cioc|lo|oc|)chè\\b', ur'\1er\2ché'),
(u'\\b([pP])iú\\b', ur'\1iù'),
(u'\\b([pP])rospicent(e|i)\\b', ur'\1rospicient\2'),
(u'\\b([pP])roveniendo\\b', ur'\1rovenendo'),
(u'\\b([pP]rofi|[pP]romis)qu(o|a|e|i)\\b', ur'\1cu\2'),
(u'\\b([qQ])ua(nto|si)chè\\b', ur'\1ua\2ché'),
(u'\\b([rR]i|)([cC])onoscien(za|te|ti)\\b', ur'\1\2onoscen\3'),
(u'\\b([sS])(econdo|ennon|enon|tante)chè\\b', ur'\1\2ché'),
(u'\\b([sS])cenz([ae])\\b', ur'\1cienz\2'),
(u'\\b([sS])enonch[èé]\b', ur'\1ennonch\2'),
(u'\\b([sS])i(c|nattanto|n|ntanto)chè\\b', ur'\1i\2ché'),
(u'\\b([sS])oprattuto\\b', ur'\1oprattutto'),
(u'\\b([sS])uperfice\\b', ur'\1uperficie'),
(u'\\b([tT])(al|almente|anto|ranne|utto)chè\\b', ur'\1\2ché'),
(u'\\b([tT])errittorio\\b', ur'\1erritorio'),
(u'<<', ur'«'),
(u'>>', ur'»'),
(u'[Kk]m[q2]', ur'km²'),

##Da fixes.py
(r'(?i)<b>(.*?)</b>',              r"'''\1'''"),
(r'(?i)<strong>(.*?)</strong>',    r"'''\1'''"),
(r'(?i)<i>(.*?)</i>',              r"''\1''"),
(r'(?i)<em>(.*?)</em>',            r"''\1''"),
(r'(?i)([\r\n])<hr[ /]*>([\r\n])', r'\1----\2'),
(r'(?i)<hr ([^>/]+?)>',            r'<hr \1 />'),
(r'(?i)([\r\n]) *<h1> *([^<]+?) *</h1> *([\r\n])',  r"\1= \2 =\3"),
(r'(?i)([\r\n]) *<h2> *([^<]+?) *</h2> *([\r\n])',  r"\1== \2 ==\3"),
(r'(?i)([\r\n]) *<h3> *([^<]+?) *</h3> *([\r\n])',  r"\1=== \2 ===\3"),
(r'(?i)([\r\n]) *<h4> *([^<]+?) *</h4> *([\r\n])',  r"\1==== \2 ====\3"),
(r'(?i)([\r\n]) *<h5> *([^<]+?) *</h5> *([\r\n])',  r"\1===== \2 =====\3"),
(r'(?i)([\r\n]) *<h6> *([^<]+?) *</h6> *([\r\n])',  r"\1====== \2 ======\3"),

            # external link in double brackets
            (r'\[\[(?P<url>https?://[^\]]+?)\]\]',   r'[\g<url>]'),
            # external link starting with double bracket
            (r'\[\[(?P<url>https?://.+?)\]',   r'[\g<url>]'),
            # external link with forgotten closing bracket
            #(r'\[(?P<url>https?://[^\]\s]+)\r\n',  r'[\g<url>]\r\n'),
            # external link ending with double bracket.
            # do not change weblinks that contain wiki links inside
            # inside the description
            (r'\[(?P<url>https?://[^\[\]]+?)\]\](?!\])',   r'[\g<url>]'),
            # wiki link closed by single bracket.
            # ATTENTION: There are some false positives, for example
            # Brainfuck code examples or MS-DOS parameter instructions.
            # There are also sometimes better ways to fix it than
            # just putting an additional ] after the link.
            (r'\[\[([^\[\]]+?)\](?!\])',  r'[[\1]]'),
            # wiki link opened by single bracket.
            # ATTENTION: same as above.
            (r'(?<!\[)\[([^\[\]]+?)\]\](?!\])',  r'[[\1]]'),
            # template closed by single bracket
            # ATTENTION: There are some false positives, especially in
            # mathematical context or program code.
            (r'{{([^{}]+?)}(?!})',       r'{{\1}}')

       ],
        'exceptions': {
                    'inside-tags': [
                                'hyperlink',    
                                'link',
                                'comment',
                                'timeline',
                                'gallery',
                                'math',
                                'pre',
                                'startspace',
                                'source', 
                                'nowiki'
                        ] ,
                   'inside': [
                                r'(?s)<[^>]+>',
                                r'(?s)\{[^\}]+\}',
                                r'&[^;]+;',
                                r'\[[\w\W]+\]',
                                r'(a|d).C.',
                                r'\"[\w\W]+\"',
                                r'(?i)(s\.n\.c|s\.r\.l|s\.a\.s|s\.p\.a)'
                       ],
                    'text-contains': [
                                r'\[CDATA\[',
                                'IagaBot'
                        ]
        }
   }

fixes['sostituzioni_standard'] = {
       'regex': True,
       'msg': {
              'it':u'Bot: [[Utente:IagaBot/Sostituzioni_standard|sostituzioni standard]]'
             },
       'replacements': [
 
(u'\[\[[Ii]mage:(.*?)\]\]', ur'[[Immagine:\1]]'),
(u'\[\[[cC]ategory:(.*?)\]\]', ur'[[Categoria:\1]]'),
(u'== ?[vV]edi [aA]nche ?==', ur'== Voci correlate =='),
(u'== ?[lL]ink [eE]sterni ?==', ur'== Collegamenti esterni =='),
(u'== ?[vV]oci [Cc]orrelate ?==', ur'== Voci correlate =='),
(u'== ?[cC]ollegamenti [Ee]sterni ?==', ur'== Collegamenti esterni ==')
      
       ]
   }