1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
5
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
9
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
15
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
19
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
25
26 Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
34
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39 For more than you ever wanted to know about Beautiful Soup, see the
40 documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43 Here, have some legalese:
44
45 Copyright (c) 2004-2007, Leonard Richardson
46
47 All rights reserved.
48
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
51 met:
52
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
55
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
60
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
65
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78 """
79 from __future__ import generators
80
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 __version__ = "3.0.5"
83 __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
84 __license__ = "New-style BSD"
85
86 from sgmllib import SGMLParser, SGMLParseError
87 import codecs
88 import types
89 import re
90 import sgmllib
91 try:
92 from htmlentitydefs import name2codepoint
93 except ImportError:
94 name2codepoint = {}
95
96
# Replace sgmllib's module-level tag-name pattern. This pattern accepts a
# leading letter followed by letters, digits, '-', '_', '.' and ':'.
# NOTE(review): the ':' is presumably there so namespaced names such as
# "ns:tag" are read as a single tag name -- confirm against sgmllib's default.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

# Name of the default output encoding. It is not referenced in the visible
# part of the file; presumably it is the fallback used when rendering.
DEFAULT_OUTPUT_ENCODING = "utf-8"
100
101
102
104 """Contains the navigational information for some part of the page
105 (either a tag or a piece of text)"""
106
107 - def setup(self, parent=None, previous=None):
108 """Sets up the initial relations between this element and
109 other elements."""
110 self.parent = parent
111 self.previous = previous
112 self.next = None
113 self.previousSibling = None
114 self.nextSibling = None
115 if self.parent and self.parent.contents:
116 self.previousSibling = self.parent.contents[-1]
117 self.previousSibling.nextSibling = self
118
def replaceWith(self, replaceWith):
    """Replace this element in its parent with `replaceWith`.

    This element is extracted from the tree and `replaceWith` is then
    inserted into the former parent at the slot this element occupied.

    `replaceWith` may be one of this element's own siblings. In that
    case insert() extracts it from the parent before re-inserting it,
    so when it sits *before* this element the target slot moves one
    place to the left and the index is adjusted accordingly.

    Raises AttributeError if this element has no parent, and ValueError
    if it is not found in its parent's contents.
    """
    oldParent = self.parent
    myIndex = oldParent.contents.index(self)
    if hasattr(replaceWith, 'parent') and replaceWith.parent == oldParent:
        # We're replacing this element with one of its siblings.
        index = oldParent.contents.index(replaceWith)
        # list.index() always returns an int, so compare it directly.
        # Testing its truthiness first would skip a sibling at index 0.
        if index < myIndex:
            # The sibling comes before this element, so removing it
            # shifts this element's slot down by one.
            myIndex = myIndex - 1
    self.extract()
    oldParent.insert(myIndex, replaceWith)
132
134 """Destructively rips this element out of the tree."""
135 if self.parent:
136 try:
137 self.parent.contents.remove(self)
138 except ValueError:
139 pass
140
141
142
143
144 lastChild = self._lastRecursiveChild()
145 nextElement = lastChild.next
146
147 if self.previous:
148 self.previous.next = nextElement
149 if nextElement:
150 nextElement.previous = self.previous
151 self.previous = None
152 lastChild.next = None
153
154 self.parent = None
155 if self.previousSibling:
156 self.previousSibling.nextSibling = self.nextSibling
157 if self.nextSibling:
158 self.nextSibling.previousSibling = self.previousSibling
159 self.previousSibling = self.nextSibling = None
160
162 "Finds the last element beneath this object to be parsed."
163 lastChild = self
164 while hasattr(lastChild, 'contents') and lastChild.contents:
165 lastChild = lastChild.contents[-1]
166 return lastChild
167
def insert(self, position, newChild):
    """Insert `newChild` into this element's contents at `position`.

    A plain string is wrapped in a NavigableString first. `position` is
    capped at the current number of children. If `newChild` already has
    a parent it is extracted from that parent before being linked in.

    Besides updating `self.contents`, this rewires the parent link, the
    sibling links (previousSibling / nextSibling) and the document-order
    links (previous / next) of the elements involved.
    """
    if (isinstance(newChild, basestring)
        or isinstance(newChild, unicode)) \
        and not isinstance(newChild, NavigableString):
        newChild = NavigableString(newChild)

    # Cap the position so that inserting "past the end" appends.
    position = min(position, len(self.contents))
    if hasattr(newChild, 'parent') and newChild.parent != None:
        # The element is already attached somewhere: this is a move.
        if newChild.parent == self:
            # NOTE(review): find() is not defined in this view, yet its
            # result is treated as a child index below -- confirm that it
            # returns the position of newChild within self.contents.
            index = self.find(newChild)
            if index and index < position:
                # The element currently sits before the target slot, so
                # extracting it moves the target slot down by one.
                position = position - 1
        newChild.extract()

    newChild.parent = self
    previousChild = None
    if position == 0:
        # First child: no previous sibling, and the element directly
        # before it in document order is this container itself.
        newChild.previousSibling = None
        newChild.previous = self
    else:
        previousChild = self.contents[position-1]
        newChild.previousSibling = previousChild
        newChild.previousSibling.nextSibling = newChild
        # In document order the new child follows the deepest last
        # descendant of its previous sibling.
        newChild.previous = previousChild._lastRecursiveChild()
    if newChild.previous:
        newChild.previous.next = newChild

    # The forward link has to be set on the last element inside newChild.
    newChildsLastElement = newChild._lastRecursiveChild()

    if position >= len(self.contents):
        # Appending at the end: there is no next sibling.
        newChild.nextSibling = None

        # Walk up the ancestors until one of them has a next sibling;
        # that sibling is what follows newChild in document order.
        parent = self
        parentsNextSibling = None
        while not parentsNextSibling:
            parentsNextSibling = parent.nextSibling
            parent = parent.parent
            if not parent:
                # Ran out of ancestors without finding a next sibling.
                break
        if parentsNextSibling:
            newChildsLastElement.next = parentsNextSibling
        else:
            newChildsLastElement.next = None
    else:
        # Inserting before an existing child: that child becomes the
        # next sibling and the next element in document order.
        nextChild = self.contents[position]
        newChild.nextSibling = nextChild
        if newChild.nextSibling:
            newChild.nextSibling.previousSibling = newChild
        newChildsLastElement.next = nextChild

    # Point the following element's back link at newChild's last element.
    if newChildsLastElement.next:
        newChildsLastElement.next.previous = newChildsLastElement
    self.contents.insert(position, newChild)
227
def append(self, tag):
    """Add `tag` as the last child of this tag."""
    # Inserting at the current child count puts the element at the end.
    endPosition = len(self.contents)
    self.insert(endPosition, tag)
231
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that comes
    after this Tag in the document.

    The search is delegated to _findOne, driven by findAllNext.
    """
    searchMethod = self.findAllNext
    return self._findOne(searchMethod, name, attrs, text, **kwargs)
236
def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                **kwargs):
    """Return all items that match the given criteria and appear
    after this Tag in the document.

    `limit`, when given, caps the number of results. Extra keyword
    arguments are forwarded to _findAll together with name, attrs and
    text, the same way the other find* methods forward them.
    """
    # Forward **kwargs: without it, keyword filters passed by the
    # caller would be silently ignored.
    return self._findAll(name, attrs, text, limit, self.nextGenerator,
                         **kwargs)
242
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest following sibling of this Tag that matches
    the given criteria.

    The search is delegated to _findOne, driven by findNextSiblings.
    """
    siblingSearch = self.findNextSiblings
    return self._findOne(siblingSearch, name, attrs, text, **kwargs)
248
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                     **kwargs):
    """Return the siblings after this Tag in the document that match
    the given criteria.

    `limit`, when given, caps the number of results.
    """
    siblingSource = self.nextSiblingGenerator
    return self._findAll(name, attrs, text, limit, siblingSource, **kwargs)
# Second name bound to the same function.
fetchNextSiblings = findNextSiblings
256
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that comes
    before this Tag in the document.

    The search is delegated to _findOne, driven by findAllPrevious.
    """
    searchMethod = self.findAllPrevious
    return self._findOne(searchMethod, name, attrs, text, **kwargs)
261
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
    """Return every item matching the given criteria that comes before
    this Tag in the document.

    `limit`, when given, caps the number of results.
    """
    backwardSource = self.previousGenerator
    return self._findAll(name, attrs, text, limit, backwardSource, **kwargs)
# Second name bound to the same function.
fetchPrevious = findAllPrevious
269
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest preceding sibling of this Tag that matches
    the given criteria.

    The search is delegated to _findOne, driven by findPreviousSiblings.
    """
    siblingSearch = self.findPreviousSiblings
    return self._findOne(siblingSearch, name, attrs, text, **kwargs)
275
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return the siblings before this Tag in the document that match
    the given criteria.

    `limit`, when given, caps the number of results.
    """
    siblingSource = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, siblingSource, **kwargs)
# Second name bound to the same function.
fetchPreviousSiblings = findPreviousSiblings
283
def findParent(self, name=None, attrs={}, **kwargs):
    """Return the closest parent of this Tag that matches the given
    criteria, or None if no parent matches.

    Extra keyword arguments are forwarded to findParents along with
    name and attrs.
    """
    # _findOne cannot be used here: it passes a `text` argument, which
    # findParents does not take. Ask findParents for at most one result.
    # **kwargs must be forwarded, otherwise keyword filters supplied by
    # the caller would be silently ignored.
    matches = self.findParents(name, attrs, 1, **kwargs)
    if matches:
        return matches[0]
    return None
294
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return the parents of this Tag that match the given criteria.

    `limit`, when given, caps the number of results. No text criterion
    is used for this search (None is passed to _findAll in its place).
    """
    ancestorSource = self.parentGenerator
    return self._findAll(name, attrs, None, limit, ancestorSource, **kwargs)
# Second name bound to the same function.
fetchParents = findParents
302
303
304
305 - def _findOne(self, method, name, attrs, text, **kwargs):
306 r = None
307 l = method(name, attrs, text, 1, **kwargs)
308 if l:
309 r = l[0]
310 return r
311
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
    """Collect the items yielded by `generator()` that match the criteria.

    If `name` is already a SoupStrainer it is used as the matcher as-is;
    otherwise a SoupStrainer is built from name, attrs, text and kwargs.
    Each truthy item from the generator is passed to the strainer's
    search(), and every truthy search result is appended to a ResultSet.
    Collection stops once `limit` results have been gathered (a falsy
    limit means no cap).
    """
    if not isinstance(name, SoupStrainer):
        # Build a matcher from the individual criteria.
        strainer = SoupStrainer(name, attrs, text, **kwargs)
    else:
        strainer = name
    results = ResultSet(strainer)
    for candidate in generator():
        if not candidate:
            # Falsy items are never searched.
            continue
        found = strainer.search(candidate)
        if not found:
            continue
        results.append(found)
        if limit and len(results) >= limit:
            break
    return results
334
335
336
def nextGenerator(self):
    """Yield each element reached by following `next` links from this
    element. The final value yielded is the falsy link that ends the
    chain."""
    current = self
    while current:
        successor = current.next
        yield successor
        current = successor
342
344 i = self
345 while i:
346 i = i.nextSibling
347 yield i
348
350 i = self
351 while i:
352 i = i.previous
353 yield i
354
356 i = self
357 while i:
358 i = i.previousSibling
359 yield i
360
def parentGenerator(self):
    """Yield each ancestor reached by following `parent` links from this
    element. The final value yielded is the falsy link that ends the
    chain."""
    current = self
    while current:
        ancestor = current.parent
        yield ancestor
        current = ancestor
366
367
def substituteEncoding(self, str, encoding=None):
    """Return `str` with every "%SOUP-ENCODING%" placeholder replaced by
    the encoding name; a falsy `encoding` is treated as "utf-8"."""
    if not encoding:
        encoding = "utf-8"
    return str.replace("%SOUP-ENCODING%", encoding)
371
372 - def toEncoding(self, s, encoding=None):
373 """Encodes an object to a string in some encoding, or to Unicode.
374 ."""
375 if isinstance(s, unicode):
376 if encoding:
377 s = s.encode(encoding)
378 elif isinstance(s, str):
379 if encoding:
380 s = s.encode(encoding)
381 else:
382 s = unicode(s)
383 else:
384 if encoding:
385 s = self.toEncoding(str(s), encoding)
386 else:
387 s = unicode(s)
388 return s
389
391
394
396 """text.string gives you text. This is for backwards
397 compatibility for Navigable*String, but for CData* it lets you
398 get the string without the CData wrapper."""
399 if attr == 'string':
400 return self
401 else:
402 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
403
405 return unicode(str(self))
406
408 if encoding:
409 return self.encode(encoding)
410 else:
411 return self
412
413 -class CData(NavigableString):
414
416 return "<![CDATA[%s]]>" %