# [Beautiful Soup Library](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) ## Making the Soup ```py from bs4 import BeautifulSoup import requests import lxml # better html parser than built-in response = requests.get("url") # retrieve a web page soup = BeautifulSoup(response.text, "html.parser") # parse HTML from response w/ python default HTML parser soup = BeautifulSoup(response.text, "lxml") # parse HTML from response w/ lxml parser soup.prettify() # prettify parsed HTML for display ``` ## Kinds of Objects Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. ### Tag A Tag object corresponds to an XML or HTML tag in the original document ```py soup = BeautifulSoup('Extremely bold', 'html.parser') # parse HTML/XML tag = soup.b type(tag) # print(tag) # Extremely bold tag.name # tag name tag["attribute"] # access to tag attribute values tag.attrs # dict of attribue-value pairs ``` ### Navigable String A string corresponds to a bit of text within a tag. Beautiful Soup uses the `NavigableString` class to contain these bits of text. ## Navigating the Tree ### Going Down ```py soup.. # navigate using tag names .contents # direct children as a list .children # direct children as a generator for iteration .descendants # iterator over all children, recursive .string # tag contents, does not have further children # If a tag's only child is another tag, and that tag has a .string, then the parent tag is considered to have the same .string as its child # If a tag contains more than one thing, then it's not clear what .string should refer to, so .string is defined to be None .strings # generator to iterate over all children's strings (will list white space) .stripped_strings # generator to iterate over all children's strings (will NOT list white space) ``` ### Going Up ```py .parent # tags direct parent (BeautifulSoup has parent None, html has parent BeautifulSoup) .parents # iterable over all parents ``` ### Going Sideways ```py .previous_sibling .next_sibling .previous_siblings .next_siblings ``` ### Going Back and Forth ```py .previous_element # whatever was parsed immediately before .next_element # whatever was parsed immediately afterwards .previous_elements # whatever was parsed immediately before as a list .next_elements # whatever was parsed immediately afterwards as a list ``` ## Searching the Tree ## Filter Types ```py soup.find_all("tag") # by name soup.find_all(["tag1", "tag2"]) # multiple tags in a list soup.find_all(function) # based on a bool function soup.find_all(True) # Match everything ``` ## Methods Methods arguments: - `name` (string): tag to search for - `attrs` (dict): attribute-value pai to search for - `string` (string): search by string contents rather than by tag - `limit` (int). limit number of results - `**kwargs`: be turned into a filter on one of a tag's attributes. ```py find_all(name, attrs, recursive, string, limit, **kwargs) # several results find(name, attrs, recursive, string, **kwargs) # one result find_parents(name, attrs, string, limit, **kwargs) # several results find_parent(name, attrs, string, **kwargs) # one result find_next_siblings(name, attrs, string, limit, **kwargs) # several results find_next_sibling(name, attrs, string, **kwargs) # one result find_previous_siblings(name, attrs, string, limit, **kwargs) # several results find_previous_sibling(name, attrs, string, **kwargs) # one result find_all_next(name, attrs, string, limit, **kwargs) # several results find_next(name, attrs, string, **kwargs) # one result find_all_previous(name, attrs, string, limit, **kwargs) # several results find_previous(name, attrs, string, **kwargs) # one result soup("html_tag") # same as soup.find_all("html_tag") soup.find("html_tag").text # text of the found tag soup.select("css_selector") # search for CSS selectors of HTML tags ``` ## Modifying the Tree ### Changing Tag Names an Attributes ```py .name = "new_html_tag" # modify the tag type ["attribute"] = "value" # modify the attribute value del ["attribute"] # remove the attribute soup.new_tag("name", = "value") # create a new tag with specified name and attributes .string = "new content" # modify tag text content .append(item) # append to Tag content .extend([item1, item2]) # add every element of the list in order .insert(position: int, item) # like .insert in Python list .insert_before(new_tag) # insert tags or strings immediately before something else in the parse tree .insert_after(new_tag) # insert tags or strings immediately before something else in the parse tree .clear() # remove all tag's contents .extract() # extract and return the tag from the tree (operates on self) .string.extract() # extract and return the string from the tree (operates on self) .decompose() # remove a tag from the tree, then completely destroy it and its contents .decomposed # check if tag has be decomposed .replace_with(item) # remove a tag or string from the tree, and replaces it with the tag or string of choice .wrap(other_tag) # wrap an element in the tag you specify, return the new wrapper .unwrap() # replace a tag with whatever's inside, good for stripping out markup .smooth() # clean up the parse tree by consolidating adjacent strings ```