Package gozerbot :: Package contrib :: Module BeautifulSoup
[hide private]
[frames | no frames]

Source Code for Module gozerbot.contrib.BeautifulSoup

   1  """Beautiful Soup 
   2  Elixir and Tonic 
   3  "The Screen-Scraper's Friend" 
   4  http://www.crummy.com/software/BeautifulSoup/ 
   5   
   6  Beautiful Soup parses a (possibly invalid) XML or HTML document into a 
   7  tree representation. It provides methods and Pythonic idioms that make 
   8  it easy to navigate, search, and modify the tree. 
   9   
  10  A well-formed XML/HTML document yields a well-formed data 
  11  structure. An ill-formed XML/HTML document yields a correspondingly 
  12  ill-formed data structure. If your document is only locally 
  13  well-formed, you can use this library to find and process the 
  14  well-formed part of it. 
  15   
  16  Beautiful Soup works with Python 2.2 and up. It has no external 
  17  dependencies, but you'll have more success at converting data to UTF-8 
  18  if you also install these three packages: 
  19   
  20  * chardet, for auto-detecting character encodings 
  21    http://chardet.feedparser.org/ 
  22  * cjkcodecs and iconv_codec, which add more encodings to the ones supported 
  23    by stock Python. 
  24    http://cjkpython.i18n.org/ 
  25   
  26  Beautiful Soup defines classes for two main parsing strategies: 
  27   
  28   * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific 
  29     language that kind of looks like XML. 
  30   
  31   * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid 
  32     or invalid. This class has web browser-like heuristics for 
  33     obtaining a sensible parse tree in the face of common HTML errors. 
  34   
  35  Beautiful Soup also defines a class (UnicodeDammit) for autodetecting 
  36  the encoding of an HTML or XML document, and converting it to 
  37  Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 
  38   
  39  For more than you ever wanted to know about Beautiful Soup, see the 
  40  documentation: 
  41  http://www.crummy.com/software/BeautifulSoup/documentation.html 
  42   
  43  Here, have some legalese: 
  44   
  45  Copyright (c) 2004-2007, Leonard Richardson 
  46   
  47  All rights reserved. 
  48   
  49  Redistribution and use in source and binary forms, with or without 
  50  modification, are permitted provided that the following conditions are 
  51  met: 
  52   
  53    * Redistributions of source code must retain the above copyright 
  54      notice, this list of conditions and the following disclaimer. 
  55   
  56    * Redistributions in binary form must reproduce the above 
  57      copyright notice, this list of conditions and the following 
  58      disclaimer in the documentation and/or other materials provided 
  59      with the distribution. 
  60   
  61    * Neither the name of the the Beautiful Soup Consortium and All 
  62      Night Kosher Bakery nor the names of its contributors may be 
  63      used to endorse or promote products derived from this software 
  64      without specific prior written permission. 
  65   
  66  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  67  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  68  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
  69  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 
  70  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  71  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  72  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  73  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  74  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  75  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  76  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 
  77   
  78  """ 
  79  from __future__ import generators 
  80   
  81  __author__ = "Leonard Richardson (leonardr@segfault.org)" 
  82  __version__ = "3.0.5" 
  83  __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson" 
  84  __license__ = "New-style BSD" 
  85   
  86  from sgmllib import SGMLParser, SGMLParseError 
  87  import codecs 
  88  import types 
  89  import re 
  90  import sgmllib 
  91  try: 
  92    from htmlentitydefs import name2codepoint 
  93  except ImportError: 
  94    name2codepoint = {} 
  95   
  96  #This hack makes Beautiful Soup able to parse XML with namespaces 
  97  sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') 
  98   
  99  DEFAULT_OUTPUT_ENCODING = "utf-8" 
 100   
 101  # First, the classes that represent markup elements. 
 102   
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Each element takes part in two sets of links:

     * parent / previousSibling / nextSibling describe its place in
       the tree.
     * previous / next chain the elements together in the order in
       which they were parsed.

    Methods that touch 'self.contents' (insert, append) only work on
    elements that define a 'contents' list; this class does not define
    one itself.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        parent   -- the element that will contain this one, or None.
        previous -- the element parsed immediately before this one,
                    or None.

        If the parent already has children, this element is linked as
        the next sibling of the parent's current last child. The
        element is NOT added to parent.contents here.
        """
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent is not None and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _identityIndex(self, elements, target):
        """Returns the position of 'target' in the list 'elements',
        comparing by identity rather than equality.

        list.index() and list.remove() compare with ==, which can pick
        a different element that merely compares equal to the target.

        Raises ValueError if 'target' is not in 'elements'.
        """
        # A manual counter is used instead of enumerate() because the
        # module advertises support for Python 2.2.
        index = 0
        for candidate in elements:
            if candidate is target:
                return index
            index = index + 1
        raise ValueError("element is not in the given list")

    def replaceWith(self, replaceWith):
        """Replaces this element with 'replaceWith' at the same
        position in the parent's list of children.

        Raises AttributeError if this element has no parent, and
        ValueError if it is not among its parent's contents.
        """
        oldParent = self.parent
        myIndex = self._identityIndex(oldParent.contents, self)
        self.extract()
        # If the replacement is an earlier sibling, insert() shifts
        # the target index itself, so no adjustment is made here.
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            siblings = self.parent.contents
            try:
                del siblings[self._identityIndex(siblings, self)]
            except ValueError:
                # Deliberately best-effort: the element may point at a
                # parent without being listed in its contents.
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts 'newChild' into this element's contents so that it
        ends up before the child currently at index 'position'.

        position -- target index; values past the end are clamped, so
                    they append.
        newChild -- the element to insert. A plain string is wrapped
                    in a NavigableString first. An element that
                    already has a parent is extracted from its old
                    location before being linked in here.
        """
        # 'unicode' is a subclass of 'basestring', so one check covers
        # both kinds of plain string.
        if isinstance(newChild, basestring) \
           and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self._identityIndex(self.contents, newChild)
                if index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        if position == 0:
            # First child: it is parsed directly after this element.
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position - 1]
            newChild.previousSibling = previousChild
            previousChild.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element following the new child's subtree is the
            # next sibling of the nearest ancestor that has one.
            ancestor = self
            followingElement = None
            while followingElement is None:
                followingElement = ancestor.nextSibling
                ancestor = ancestor.parent
                if ancestor is None:
                    # This is the last element in the document.
                    break
            newChildsLastElement.next = followingElement
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if nextChild is not None:
                nextChild.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    # NOTE: the 'attrs={}' defaults below are kept for interface
    # compatibility. This class only passes them along and never
    # modifies them.

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document, or None."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document,
        or None."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document, or None."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document,
        or None."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given find-all style 'method' with a limit of 1
        and returns its first result, or None if there is none."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        name      -- a SoupStrainer to use as-is, or a criterion used
                     (with attrs, text and kwargs) to build one.
        limit     -- stop once this many results are collected; a
                     false value means no limit.
        generator -- a callable returning the elements to examine.

        Returns a ResultSet of whatever strainer.search() found.
        """
        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The generators below finish by yielding None; skip it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #
    #Each one yields the elements reached by following one kind of
    #link from this element, and then yields a final None when the
    #links run out.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Returns 'str' with every "%SOUP-ENCODING%" placeholder
        replaced by 'encoding' ("utf-8" when none is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, 's' is encoded with it (other objects are
        passed through str() first). Without one, a unicode object is
        returned unchanged and anything else is passed to unicode().
        """
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
389 412
413 -class CData(NavigableString):
414
415 - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
416 return "<![CDATA[%s]]>" %