Latest revision as of 23:55, 7 February 2013

Regular expression in Python

A great introduction can be found here http://docs.python.org/2/howto/regex.html#regex-howto
In a nutshell, re tries to match a certain pattern within a string
pattern

import re #import regular expression module

### compile a pattern ###
p = re.compile('This is'); #define the pattern you are looking for
p.findall('This is something great. No! This is actually shit!') #look for all occurrences
Out[35]: ['This is', 'This is']
# but this is quite literal, not really taking advantage of regexp power
# for that we can introduce metacharacters . ^ $ * + ? { } [ ] \ | ( )

#lets go through them individually
# [] - character class - matches any single character from the set of characters inside the []
p = re.compile(r'[is]'); 
p.findall('This is something great. No! This is actually shit!')
Out[37]: ['i', 's', 'i', 's', 's', 'i', 'i', 's', 'i', 's', 's', 'i'] #matched all the i and s characters 

#we can also use character class with ranges inside
p = re.compile(r'[a-d]'); # match all the characters in the range from a to d
p.findall('This is something great. No! This is actually shit!') 
Out[40]: ['a', 'a', 'c', 'a']

#or numbers
p = re.compile(r'[0-5]'); 
p.findall('Th1s i5 s6methin9 9reat. N0! Th15 is actua117 5h17!')
Out[41]: ['1', '5', '0', '1', '5', '1', '1', '5', '1']

#or combine it with the not/exclude ^ metacharacter
p = re.compile(r'[^a-z]'); p.findall('This is something great. No! This is actually shit!') #match everything besides lowercase letter from a to z
Out[45]: ['T', ' ', ' ', ' ', '.', ' ', 'N', '!', ' ', 'T', ' ', ' ', ' ', '!']

#however if you want to select a character that is in a string but also happens to be a metacharacter
#such as select the ^
p = re.compile(r'^'); p.findall('This ^^ is ^^^ something^ greatl')
Out[46]: [''] # no ^ character was selected: re treats a bare ^ as a metacharacter (start of string), so it only matched the empty string at the beginning
# we need to escape it, by using \ to ESCAPE the character,
# which makes re take ^ as a LITERAL
p = re.compile(r'\^'); p.findall('This ^^ is ^^^ something^ greatl')
Out[47]: ['^', '^', '^', '^', '^', '^']

#be wary of the \ if followed by a letter, such as in: \d \D \W \w \S \s , or in other words a SEQUENCE
#a SEQUENCE represents a predefined set of characters that is often used

# \d decimal digit [0-9]
# \D any character that is not a decimal digit [^0-9]
# \w alphanumeric character [a-zA-Z0-9_]
# \W non-alphanumeric character [^a-zA-Z0-9_]
# \s white-space character
# \S non-whitespace character
# . any character except a newline (unless the re.DOTALL flag is used)

#some examples
p = re.compile(r'\d'); p.findall('This is something great1') #get all decimal digits
Out[50]: ['1']

# sequences can be combined with classes 
p = re.compile(r'[\W, t]'); p.findall('This is something great. No! This is actually shit!') #non-alphanumeric characters and t (the comma and space inside [] are literal members of the class, already covered by \W)
Out[83]: [' ', ' ', 't', ' ', 't', '.', ' ', '!', ' ', ' ', ' ', 't', ' ', 't', '!']

#NOTICE CASE SENSITIVITY
# only the lowercase t, as we indicated, is being selected. If we want both lower and upper case to be matched we need to indicate it
Out[83]: [' ', ' ', 't', ' ', 't', '.', ' ', '!', ' ', ' ', ' ', 't', ' ', 't', '!']

p = re.compile('[\W, t]', re.IGNORECASE); p.findall('This is something great. No! This is actually shit!')
Out[86]:  ['T',  ' ',  ' ',  't',  ' ',  't',  '.',  ' ',  '!',  ' ',  'T',  ' ',  ' ',  't',  ' ',  't',  '!']   

# sequences can also be combined with simple character
p = re.compile(r'i..'); p.findall('This is something great. No! This is actually shit!') #i and the two following characters
Out[24]: ['is ', 'is ', 'ing', 'is ', 'is ', 'it!'] 

p = re.compile(r'i\S\S'); p.findall('This is something great. No! This is actually shit!')#i followed by two non-white-space characters
Out[28]: ['ing', 'it!']


# the () means get only what is between () # MATCH GROUP
p = re.compile(r'i(\S\S)'); p.findall('This is something great. No! This is actually shit!') #the two non-white-space characters that follow i; i will not be included
Out[31]: ['ng', 't!']


# REPETITIONS
# RE can also specify the number of times a pattern can be matched

# * matches zero or more repetitions of that pattern

p = re.compile('c.*?t'); p.findall('cannot connect that to my cat c   o  t ct') #match c followed by any character, repeating 0 or more times; the ? after * makes it non-greedy, so each match stops at the first t it can
Out[71]: ['cannot', 'connect', 'cat', 'c   o  t', 'ct']

p = re.compile('c\S*t'); p.findall('cannot connect that to ct my cat c   o  t') #match c followed by any non-white-space character, repeating 0 or more times    
Out[68]: ['cannot', 'connect', 'ct', 'cat']

# + behaves similarly to *, only that it matches 1 or more repetitions
p = re.compile('c.+t'); p.findall('cannot connect that to ct my cat c   o  t') #match c followed by any character, repeating 1 or more times    
Out[74]: ['cannot connect that to ct my cat c   o  t']


# ? matches 0 or 1 times
p = re.compile('c.?t'); p.findall('cannot connect that to ct my cat c   o  t')
Out[75]: ['ct', 'ct', 'cat']

# {m,n}, where m and n are decimal integers, means there must be at least m repetitions and at most n. Either m or n can be omitted.
p = re.compile('c.{4,6}t'); p.findall('cannot connect that to ct my cat c   o  t') #any characters, appearing 4 to 6 times, between c and t
Out[81]: ['cannot', 'connect', 'c   o  t']


# RAW STRINGS
# earlier in this document I wrote something like p=re.compile(r'\d') instead of simply p=re.compile('\d')
# what that r does is turn the string literal into a raw string, so Python passes the backslashes to re unchanged
#so \d reaches re as the sequence selecting decimal digits
p = re.compile("\\d"); p.findall('thi5 1s a 7es7 do \d')
Out[101]: ['5', '1', '7', '7']

p = re.compile(r"\\d"); p.findall('thi5 1s a 7es7 do \d')
Out[101]: ['\\d'] #r"\\d" is the pattern \\d: a literal backslash followed by d, so only the \d at the end of the string matches


p = re.compile('\d'); p.findall('thi5 1s a 7es7 do \d')
Out[87]: ['5', '1', '7', '7'] #'\d' reaches re as the \d sequence, so the decimal digits match



# MATCHES
# also you might have noticed I kept on using findall, to get every substring matching each re
# however that is just one possible method

#match()      Determines if the RE matches at the beginning of the string.
#search()     Scans through a string, looking for any location where this RE matches.
#findall()    Finds all substrings where the RE matches, and returns them as a list.
#finditer()   Finds all substrings where the RE matches, and returns them as an iterator. 


p = re.compile("this"); p.match('this is something')
Out[109]: <_sre.SRE_Match at 0x7f2990038bf8>
p = re.compile("this"); p.match('is this something?')
#returns None (no match at the beginning of the string)

re.search("this", 'this is something')
Out[113]: <_sre.SRE_Match at 0x7f29977ce030> #returns a match object (use .start() or .span() on it to get the position)
re.search("this", 'is this something')
Out[114]: <_sre.SRE_Match at 0x7f2990038cc8> #returns a match object (use .start() or .span() on it to get the position)
re.search("this", 'is something')
#returns None (the pattern does not occur anywhere in the string)


#word boundary (in MORE METACHARACTERS)