Donate to support Ukraine's independence.

24 May'11

BeautifulSoup Parsing

1. Import dependencies

import urllib
from BeautifulSoup import BeautifulSoup

2. Settings

# Root URL of the site being scraped.
site = "http://****.in.ua"
# Listing page; the paging query strings below are appended to it.
base = site + "/url/****.html"
# Queue of listing query strings still to fetch, seeded with the first page.
parse_urls = ["?page=1"]
# Query strings already fetched, and article links collected so far.
parsed, urls = [], []

3. Prepopulate the URL bank by following the paging links

def parser(fun):
    """Crawl one listing page: collect article links and queue unseen pages.

    Pops the next query string from ``parse_urls``, records it in ``parsed``,
    downloads ``base + element``, appends the href of every element matched
    by ``findAll(True, 'right_block')`` to ``urls`` and queues each pager
    link that is neither pending nor already fetched.

    Args:
        fun: Unused; kept so existing callers keep working.
    """
    element = parse_urls.pop()
    parsed.append(element)
    page = urllib.urlopen(base + element)
    try:
        soup = BeautifulSoup(page.read())
    finally:
        # Release the connection even if reading or parsing fails.
        page.close()
    for topic in soup.findAll(True, 'right_block'):
        urls.append(topic.p.a["href"])
    pager = soup.find(id="page_list")
    if pager is None:
        # A listing without a pager has no further pages to queue.
        return
    for link in pager.findAll('li'):
        if link.a is None:
            # Pager items without an anchor carry no link to follow.
            continue
        href = link.a["href"]
        if href not in parse_urls and href not in parsed:
            parse_urls.append(href)

# Keep crawling until the queue of listing pages is drained.
# NOTE(review): blog_parse is not defined in the code shown here - confirm
# it exists before running; parser() ignores its argument.
while parse_urls:
    parser(blog_parse)

4. Parse

pages = []
for url in urls:
    page = urllib.urlopen(site + urllib.quote(url.encode("cp1251")))
    soup = BeautifulSoup(page.read())
    print url
    a = parse_item(soup)
    pages.append(a)

5. Populate the Django database through the ORM

def parse_item(soup):
    """Build and save an Article from one parsed article page.

    Reads the author, title, date, description, body and image out of
    ``soup``, fetches or creates the matching Author, downloads the image
    and saves the Article.

    Relies on ``re``, ``datetime``, ``randint``, ``hashlib``, ``urllib``,
    ``ContentFile``, ``Author`` and ``Article`` being in scope at module
    level.

    Args:
        soup: Parsed article page (a BeautifulSoup document).

    Returns:
        The saved Article instance, so callers can collect the results.
    """
    header = soup.find("div", id="avtor").findAll("span")
    # The first span is expected to hold "<name> <surname>, ..."; only the
    # part before the first comma is used.
    (name, surname) = header[0].string.strip().split(",")[0].split(" ")
    (author, _created) = Author.objects.get_or_create(name=name, surname=surname)

    view = soup.find("div", id="view")
    a = Article()
    a.author = author
    a.title = view.h1.string
    # The second span carries the date as day.month.year.
    (day, month, year) = re.findall(r'(\d+)\.(\d+)\.(\d+)', header[1].string.strip())[0]
    a.date = datetime(int(year), int(month), int(day))
    a.description = view.p.renderContents().strip()

    for item in view.findAll("div"):
        # NOTE(review): `in` on a Tag may test its children rather than its
        # attributes - confirm this really filters on id/class as intended.
        if ("id" in item) or ("class" in item) or item.string is None:
            continue
        content = item.renderContents()
        if content is not None and content.strip():
            a.body += content.strip() + "\n"

    # Store the image under a random md5-based name, keeping the extension
    # of the source URL.
    digest = hashlib.md5(str(randint(0, 46364202620))).hexdigest()
    urlfile = soup.find("div", id="galImg").img["src"]
    extension = urlfile.split(".")[-1]
    a.image.save(digest + "." + extension, ContentFile(urllib.urlopen(urlfile).read()))
    a.save()
    return a

6. Django Models

from django.db import models
from django.contrib.auth.models import User

class Author(models.Model):
    """Person credited as the writer of articles."""
    name = models.CharField(max_length=100)
    surname = models.CharField(max_length=100)

    def articles(self):
        """Return the number of Article rows that reference this author."""
        written = Article.objects.filter(author=self)
        return written.count()

    def __unicode__(self):
        """Return the name and surname separated by a space."""
        full_name = (self.name, self.surname)
        return "%s %s" % full_name

class Article(models.Model):
    """A scraped article with its author, date, text and image file."""
    # Headline, limited to 100 characters.
    title = models.CharField(max_length=100)
    # Author row this article belongs to.
    author = models.ForeignKey(Author)
    # Date of the article (no time component).
    date = models.DateField()
    # Short text stored separately from the body.
    description = models.TextField()
    # Main article text.
    body = models.TextField()
    # Attached file, stored under "attachments".
    image = models.FileField(upload_to="attachments")

    def __unicode__(self):
        """Return the article title."""
        return self.title

class UploaderProfile(models.Model):
    """Per-user profile holding an upload counter."""
    uploads = models.IntegerField()
    user = models.OneToOneField(User)

    def __unicode__(self):
        """Return the username followed by the upload count."""
        username = self.user.username
        return "%s (%d files uploaded)" % (username, self.uploads)

Comments