Revision as of 22:54, 7 February 2013

Regular expression in Python

A great introduction can be found here http://docs.python.org/2/howto/regex.html#regex-howto

In a nutshell, re tries to match a certain pattern within a string.

pattern

compile a pattern

import re #import regular expression module
p = re.compile('This is'); #define the pattern you are looking for
p.findall('This is something great. No! This is actually shit!') #look for all occurrences
Out[35]: ['This is', 'This is']
# but this is quite literal, not really taking advantage of regexp power
# for that we can introduce metacharacters . ^ $ * + ? { } [ ] \ | ( )

#let's go through them individually
# [] - character class - which is a set of characters inside the []
p = re.compile(r'[is]'); 
p.findall('This is something great. No! This is actually shit!')
Out[37]: ['i', 's', 'i', 's', 's', 'i', 'i', 's', 'i', 's', 's', 'i'] #matched all the i and s characters 

#we can also use character class with ranges inside
p = re.compile(r'[a-d]'); # match all the characters in the range from a to d
p.findall('This is something great. No! This is actually shit!') 
Out[40]: ['a', 'a', 'c', 'a']

#or numbers
p = re.compile(r'[0-5]'); 
p.findall('Th1s i5 s6methin9 9reat. N0! Th15 is actua117 5h17!')
Out[41]: ['1', '5', '0', '1', '5', '1', '1', '5', '1']

#or combine it with the not/exclude ^ metacharacter
p = re.compile(r'[^a-z]'); p.findall('This is something great. No! This is actually shit!') #match everything except lowercase letters from a to z
Out[45]: ['T', ' ', ' ', ' ', '.', ' ', 'N', '!', ' ', 'T', ' ', ' ', ' ', '!']

#however if you want to select a character that is in a string but also happens to be a metacharacter
#such as select the ^
p = re.compile(r'^'); p.findall('This ^^ is ^^^ something^ greatl')
Out[46]: [''] # only one empty match (at the start of the string), because re treats ^ as a metacharacter (the start-of-string anchor)
# we need to escape it, by using \ to ESCAPE the character,
# which will then take ^ as a LITERAL
p = re.compile(r'\^'); p.findall('This ^^ is ^^^ something^ greatl')
Out[47]: ['^', '^', '^', '^', '^', '^']

#be wary of the \ if followed by a letter, such as in: \d \D \W \w \S \s , or in other words a SEQUENCE
#a SEQUENCE represents a predefined set of characters that are often used

# \d decimal digit
# \D non-digit character [^0-9]
# \w alphanumeric character [a-zA-Z0-9_]
# \W non-alphanumeric character [^a-zA-Z0-9_]
# \s white-space character
# \S non-whitespace character
# . any character except a newline

#some examples
p = re.compile(r'\d'); p.findall('This is something great1') #get all decimal digits
Out[50]: ['1']

# sequences can be combined with classes 
p = re.compile(r'[\W, t]'); p.findall('This is something great. No! This is actually shit!') #non-alphanumeric characters, plus a literal comma, a space, and t
Out[83]: [' ', ' ', 't', ' ', 't', '.', ' ', '!', ' ', ' ', ' ', 't', ' ', 't', '!']

#NOTICE CASE SENSITIVITY
# only the lowercase t, as we indicated, is being selected. If we want both lower and upper case to be matched, we need to indicate it with re.IGNORECASE
Out[83]: [' ', ' ', 't', ' ', 't', '.', ' ', '!', ' ', ' ', ' ', 't', ' ', 't', '!']

p = re.compile('[\W, t]', re.IGNORECASE); p.findall('This is something great. No! This is actually shit!')
Out[86]:  ['T',  ' ',  ' ',  't',  ' ',  't',  '.',  ' ',  '!',  ' ',  'T',  ' ',  ' ',  't',  ' ',  't',  '!']   

# sequences can also be combined with simple character
p = re.compile(r'i..'); p.findall('This is something great. No! This is actually shit!') #i and the two following characters
Out[24]: ['is ', 'is ', 'ing', 'is ', 'is ', 'it!'] 

p = re.compile(r'i\S\S'); p.findall('This is something great. No! This is actually shit!')#i followed by two non-white-space characters
Out[28]: ['ing', 'it!']


# the () means get only what is between () # MATCH GROUP
p = re.compile(r'i(\S\S)'); p.findall('This is something great. No! This is actually shit!') #the two non-white-space characters that follow i; i will not be included
Out[31]: ['ng', 't!']


# REPETITIONS
# RE can also specify the number of times a pattern can be matched

# * matches zero or more repetitions of that pattern

p = re.compile('c.*?t'); p.findall('cannot connect that to my cat c   o  t ct') #match c followed by any character, repeating 0 or more times (the ? after * makes it non-greedy, so each match stops at the first t)
Out[71]: ['cannot', 'connect', 'cat', 'c   o  t', 'ct']

p = re.compile('c\S*t'); p.findall('cannot connect that to ct my cat c   o  t') #match c followed by any non-white-space character, repeating 0 or more times    
Out[68]: ['cannot', 'connect', 'cat', 'ct']

# + behaves similarly to *, except that it matches 1 or more repetitions
p = re.compile('c.+t'); p.findall('cannot connect that to ct my cat c   o  t') #match c followed by any character, repeating 1 or more times    
Out[74]: ['cannot connect that to ct my cat c   o  t']


# ? matches 0 or 1 times
p = re.compile('c.?t'); p.findall('cannot connect that to ct my cat c   o  t')
Out[75]: ['ct', 'ct', 'cat']

# {m,n}, where m and n are decimal integers, means there must be at least m repetitions, and at most n. Either m or n can be omitted.
p = re.compile('c.{4,6}t'); p.findall('cannot connect that to ct my cat c   o  t') #any characters, appearing 4 to 6 times, between c and t
Out[81]: ['cannot', 'connect', 'c   o  t']


# RAW STRINGS
# earlier in this document I wrote something like p=re.compile(r'\d') instead of simply p=re.compile('\d')
# what the r prefix does is make the string raw, so Python does not process backslashes as string escapes
#so a plain \d will be a metacharacter selecting decimal digits
p = re.compile("\\d"); p.findall('thi5 1s a 7es7 do \d')
Out[101]: ['5', '1', '7', '7']

p = re.compile(r"\\d"); p.findall('thi5 1s a 7es7 do \d')
Out[101]: ['\\d'] # in a raw string \\d is the regex for a literal backslash followed by d


p = re.compile('\d'); p.findall('thi5 1s a 7es7 do \d')
Out[87]: ['5', '1', '7', '7'] # '\d' is not a Python string escape, so the regex still sees \d and matches digits



# MATCHES
# you might also have noticed I kept on using findall to get all the matches of each re
# however, that is just one possible method

#match()      Determines if the RE matches at the beginning of the string.
#search()     Scans through a string, looking for any location where this RE matches.
#findall()    Finds all substrings where the RE matches, and returns them as a list.
#finditer()   Finds all substrings where the RE matches, and returns them as an iterator. 


p = re.compile("this"); p.match('this is something')
Out[109]: <_sre.SRE_Match at 0x7f2990038bf8>
p = re.compile("this"); p.match('is this something?')
#returns nothing

re.search("this", 'this is something')
Out[113]: <_sre.SRE_Match at 0x7f29977ce030> #returns a match object (call .start() or .span() on it to get the position)
re.search("this", 'is this something')
Out[114]: <_sre.SRE_Match at 0x7f2990038cc8> #returns a match object (call .start() or .span() on it to get the position)
re.search("this", 'is something')
#returns nothing


#word boundary (in MORE METACHARACTERS)

@@ Line 1: / Line 1: @@
 =Regular expression in Python=
-A great introdution can be found here http://docs.python.org/2/howto/regex.html#regex-howto
+A great introduction can be found here http://docs.python.org/2/howto/regex.html#regex-howto
-But succitaly the re try to match certain pattern in a string or in words within strings
+In a nutshell '''re''' tries to match certain pattern within a string
 ==pattern==
-# compile a pattern
+===compile a pattern===
+<source lang=python>
 import re #import regular expression module
 p = re.compile('This is'); #define the pattern you are looking for
-p.findall('This is something great. No! This is actually shit!') #look for all occurances
+p.findall('This is something great. No! This is actually shit!') #look for all occurences
 Out[35]: ['This is', 'This is']
-# but this quite literal, not realy taking advantage of regexp power
+# but this quite literal, not really taking advantage of regexp power
 # for that we can introduce metacharacters . ^ $ * + ? { } [ ] \ | ( )
-#lets go throught them individualy
+#lets go through them individually
 # [] - character class - which a set of characters inside the []
 p = re.compile(r'[is]');
@@ Line 46: / Line 46: @@
 #a SEQUENCE represent A predefined set of characters that are often used
-# \d decima digit
+# \d decimal digit
 # \D non-decimal digit
 # \w alphanumeric character [a-zA-Z0-9_]
@@ Line 149: / Line 149: @@
+</source>
-----
-In [121]: pattern = re.compile(r'.') #match any chacter except a new line
-In [122]: pattern.findall('A long summer')
-Out[122]: ['A', ' ', 'l', 'o', 'n', 'g', ' ', 's', 'u', 'm', 'm', 'e', 'r']
-In [153]: pattern = re.compile(r'This .')# matches only 1 characted
-In [154]: pattern.findall('This stuff is great')
-Out[154]: ['This s']
-In [177]: pattern = re.compile(r'This (.)') #it makes a group
-In [178]: pattern.findall('This is not that great')
-Out[178]: ['i']
-In [169]: pattern = re.compile(r'^T')
-In [170]: pattern.findall('This stuff is great')
-Out[170]: ['T']
-                          #$ end of line. (Not understard)
-In [201]: pattern = re.compile(r'[is]') #[] a set o characters to be matched
-In [202]: pattern.findall('This is not that great\nBut in a new line.')
-Out[202]: ['i', 's', 'i', 's', 'i', 'i']
-In [205]: pattern.findall('This is not that great\nBut in a new line.')
-Out[205]: ['is', 'is', 'in', 'in']
-In [211]: pattern = re.compile(r'\w') #matches any alphanumeric character and the underscore
-In [212]: pattern.findall('This is not that great\nBut in a new line.')
-Out[212]:
-['T',
- 'h',
- 'i',
- 's',
- 'i',
- 's',
- 'n',
- 'o',
- 't',
- 't',
- 'h',
- 'a',
- 't',
- 'g',
- 'r',
- 'e',
- 'a',
- 't',
- 'B',
- 'u',
- 't',
- 'i',
- 'n',
- 'a',
- 'n',
- 'e',
- 'w',
- 'l',
- 'i',
- 'n',
- 'e']
-In [213]: pattern = re.compile(r'\w\w')
-In [214]: pattern.findall('This is not that great\nBut in a new line.')
-Out[214]: ['Th', 'is', 'is', 'no', 'th', 'at', 'gr', 'ea', 'Bu', 'in', 'ne', 'li', 'ne']
-In [215]: pattern = re.compile(r'\W') #matches any non-alphanumeric character
-In [216]: pattern.findall('This is not that great\nBut in a new line.')
-Out[216]: [' ', ' ', ' ', ' ', '\n', ' ', ' ', ' ', ' ', '.']
-'*' match 0 or more repetitions of the preceding RE
-See the difference:
-In [48]: pattern = re.compile(r'This is (.)')
-In [49]: pattern.findall('This is something great')
-Out[49]: ['s']
-In [47]: p=r'This is (.*)'
-In [48]: pattern = re.compile(p)
-In [49]: pattern.findall(s)
-Out[50]: ['something great. No!!']