PythonRegEx

From XPUB & Lens-Based wiki
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

Regular expression in Python

A great introduction can be found here http://docs.python.org/2/howto/regex.html#regex-howto

In a nutshell, re tries to match a certain pattern within a string

pattern

import re #import regular expression module

### compile a pattern ###
p = re.compile('This is'); #define the pattern you are looking for
p.findall('This is something great. No! This is actually shit!') #look for all occurrences
Out[35]: ['This is', 'This is']
# but this is quite literal, not really taking advantage of the power of regexps
# for that we can introduce metacharacters . ^ $ * + ? { } [ ] \ | ( )

#lets go through them individually
# [] - character class - which is a set of characters inside the []; it matches any ONE of them
p = re.compile(r'[is]'); 
p.findall('This is something great. No! This is actually shit!')
Out[37]: ['i', 's', 'i', 's', 's', 'i', 'i', 's', 'i', 's', 's', 'i'] #matched all the i and s characters 

#we can also use character class with ranges inside
p = re.compile(r'[a-d]'); # match all the characters in the range from a to d
p.findall('This is something great. No! This is actually shit!') 
Out[40]: ['a', 'a', 'c', 'a']

#or numbers
p = re.compile(r'[0-5]'); 
p.findall('Th1s i5 s6methin9 9reat. N0! Th15 is actua117 5h17!')
Out[41]: ['1', '5', '0', '1', '5', '1', '1', '5', '1']

#or combine it with the not/exclude ^ metacharacter
p = re.compile(r'[^a-z]'); p.findall('This is something great. No! This is actually shit!') #match everything besides lowercase letter from a to z
Out[45]: ['T', ' ', ' ', ' ', '.', ' ', 'N', '!', ' ', 'T', ' ', ' ', ' ', '!']

#however if you want to select a character that is in a string but also happens to be a metacharacter
#such as select the ^
p = re.compile(r'^'); p.findall('This ^^ is ^^^ something^ greatl')
Out[46]: [''] # no ^ was selected: re takes ^ as a metacharacter (start of string), so it only matched the empty string at the beginning
# we need to escape it, by using \ to ESCAPE the character,
# which will then take ^ as a LITERAL
p = re.compile(r'\^'); p.findall('This ^^ is ^^^ something^ greatl')
Out[47]: ['^', '^', '^', '^', '^', '^']

#be wary of the \ if followed by a letter, such as in: \d \D \W \w \S \s , or in other words a SEQUENCE
#a SEQUENCE represents a predefined set of characters that is often used

# \d decimal digit
# \D any character that is not a decimal digit
# \w alphanumeric character [a-zA-Z0-9_]
# \W non-alphanumeric character [^a-zA-Z0-9_]
# \s white-space character
# \S non-whitespace character
# . any character

#some examples
p = re.compile(r'\d'); p.findall('This is something great1') #get all decimal digits
Out[50]: ['1']

# sequences can be combined with classes 
p = re.compile(r'[\W, t]'); p.findall('This is something great. No! This is actually shit!') #non-alphanumeric characters and t (the comma and space inside [] are also literal members of the class)
Out[83]: [' ', ' ', 't', ' ', 't', '.', ' ', '!', ' ', ' ', ' ', 't', ' ', 't', '!']

#NOTICE CASE SENSITIVITY
# only the lowercase t, as we indicated, is being selected. If we want both lower and upper case to be matched, we need to indicate it with re.IGNORECASE
Out[83]: [' ', ' ', 't', ' ', 't', '.', ' ', '!', ' ', ' ', ' ', 't', ' ', 't', '!']

p = re.compile('[\W, t]', re.IGNORECASE); p.findall('This is something great. No! This is actually shit!')
Out[86]:  ['T',  ' ',  ' ',  't',  ' ',  't',  '.',  ' ',  '!',  ' ',  'T',  ' ',  ' ',  't',  ' ',  't',  '!']   

# sequences can also be combined with simple character
p = re.compile(r'i..'); p.findall('This is something great. No! This is actually shit!') #i and the two following characters
Out[24]: ['is ', 'is ', 'ing', 'is ', 'is ', 'it!'] 

p = re.compile(r'i\S\S'); p.findall('This is something great. No! This is actually shit!')#i followed by two non-white-space characters
Out[28]: ['ing', 'it!']


# the () means get only what is between () # MATCH GROUP
p = re.compile(r'i(\S\S)'); p.findall('This is something great. No! This is actually shit!') #the two non-white-space characters that follow i; i will not be included
Out[31]: ['ng', 't!']


# REPETITIONS
# RE can also specify the number of times a pattern can be matched

# * matches zero or more repetitions of that pattern

p = re.compile('c.*?t'); p.findall('cannot connect that to my cat c   o  t ct') #match c followed by any character, repeating 0 or more times, up to the nearest t (the ? after * makes it non-greedy, i.e. match as few characters as possible)
Out[71]: ['cannot', 'connect', 'cat', 'c   o  t', 'ct']

p = re.compile('c\S*t'); p.findall('cannot connect that to ct my cat c   o  t') #match c followed by any non-white-space character, repeating 0 or more times    
Out[68]: ['cannot', 'connect', 'ct', 'cat']

# + behaves similarly to *, only that it matches 1 or more repetitions (and, being greedy, it matches as much as possible)
p = re.compile('c.+t'); p.findall('cannot connect that to ct my cat c   o  t') #match c followed by any character, repeating 1 or more times    
Out[74]: ['cannot connect that to ct my cat c   o  t']


# ? matches 0 or 1 times
p = re.compile('c.?t'); p.findall('cannot connect that to ct my cat c   o  t')
Out[75]: ['ct', 'ct', 'cat']

# {m,n}, where m and n are decimal integers, means there must be at least m repetitions, and at most n. Either m or n can be omitted.
p = re.compile('c.{4,6}t'); p.findall('cannot connect that to ct my cat c   o  t') #any characters, appearing 4 to 6 times, between c and t
Out[81]: ['cannot', 'connect', 'c   o  t']


# RAW STRINGS
# earlier in this document I wrote something like p=re.compile(r'\d') instead of simply p=re.compile('\d')
# what that r does is turn the string into a raw string, so Python leaves the backslashes untouched and re receives them as written.
#so normal \d will be a metacharacter selecting decimal digits
p = re.compile("\\d"); p.findall('thi5 1s a 7es7 do \d')
Out[101]: ['5', '1', '7', '7']

p = re.compile(r"\\d"); p.findall('thi5 1s a 7es7 do \d')
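Out[101]: ['\\d'] # in a raw string \\ is an escaped backslash for re, so this pattern matches the literal text \d, not the digits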


p = re.compile('\d'); p.findall('thi5 1s a 7es7 do \d')
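Out[87]: ['5', '1', '7', '7'] # '\d' is not a recognized Python string escape, so the backslash is kept and re still sees the \d digit sequence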



# MATCHES
# also you might have noticed I kept on using findall, to get all the matches of each re
# however that is just one possible method

#match()      Determines if the RE matches at the beginning of the string.
#search()     Scans through a string, looking for any location where this RE matches.
#findall()    Finds all substrings where the RE matches, and returns them as a list.
#finditer()   Finds all substrings where the RE matches, and returns them as an iterator. 


p = re.compile("this"); p.match('this is something')
Out[109]: <_sre.SRE_Match at 0x7f2990038bf8>
p = re.compile("this"); p.match('is this something?')
#returns None, because the string does not START with "this"

re.search("this", 'this is something')
Out[113]: <_sre.SRE_Match at 0x7f29977ce030> #returns a match object; call .start() or .span() on it to get the position
re.search("this", 'is this something')
Out[114]: <_sre.SRE_Match at 0x7f2990038cc8> #returns a match object; call .start() or .span() on it to get the position
re.search("this", 'is something')
#returns nothing


#word boundary (in MORE METACHARACTERS)