Donate to support Ukraine's independence.

24 May'11

BeautifulSoup Parsing

1. Import dependencies

import urllib
from BeautifulSoup import BeautifulSoup

2. Settings

# Work queue of pagination query strings still to be fetched,
# seeded with the first page.
parse_urls = ["?page=1"]
# Query strings that have already been taken off the queue.
parsed = list()
# Hrefs harvested from the listing pages.
urls = list()

# Site root, and the listing page each query string is appended to.
site = "http://****.in.ua"
base = "".join([site, "/url/****.html"])

3. Prepopulate the URL bank by following the pagination links

def _anchor_href(tag):
    """Return the href of the first <a> under ``tag``, or None if absent."""
    if tag is None:
        return None
    anchor = tag.a
    if anchor is None:
        return None
    return anchor.get("href")


def parser(fun):
    """Fetch the next queued listing page and harvest its links.

    Pops one query string off ``parse_urls``, records it in ``parsed``,
    downloads ``base + element`` and then:

    * appends to ``urls`` the href of the first ``<p>``/``<a>`` found in
      each tag matched by ``findAll(True, 'right_block')``;
    * queues every href found in the ``<li>`` items under the element
      with ``id="page_list"`` that is neither already queued nor already
      fetched.

    Tags without the expected link, and pages without a ``page_list``
    element, are skipped instead of raising.

    ``fun`` is not used by this function; it is kept so existing callers
    that pass an argument keep working.

    Raises IndexError if ``parse_urls`` is empty.
    """
    element = parse_urls.pop()
    parsed.append(element)

    page = urllib.urlopen(base + element)
    try:
        soup = BeautifulSoup(page.read())
    finally:
        # Release the connection even if reading or parsing fails.
        page.close()

    for topic in soup.findAll(True, 'right_block'):
        href = _anchor_href(topic.p)
        if href is not None:
            urls.append(href)

    pager = soup.find(id="page_list")
    if pager is None:
        # No pagination block on this page: nothing more to queue.
        return

    for link in pager.findAll('li'):
        # An <li> without a link has nothing to follow, so it is skipped.
        href = _anchor_href(link)
        if href is None:
            continue
        if href not in parse_urls and href not in parsed:
            parse_urls.append(href)

# Keep calling parser() until the queue of pages to fetch is empty.
while parse_urls:
    parser(blog_parse)

4. Parse

pages = []
for url …

Continue reading