BeautifulSoup Parsing
1. Import dependencies
import urllib from BeautifulSoup import BeautifulSoup
2. Settings
# Crawl settings and shared work lists used by the steps below.

# Root of the site being scraped (the host is masked in this snippet).
site = "http://****.in.ua"

# Listing page; each pagination query string from parse_urls is appended to it.
base = site + "/url/****.html"

# Pagination query strings still waiting to be crawled, seeded with page one.
parse_urls = ["?page=1"]

# Pagination query strings that have already been crawled.
parsed = []

# Item links collected from the listing pages, fetched one by one in step 4.
urls = []
3. Prepopulate the URL bank by following the pagination links
def parser(fun=None):
    """Crawl one listing page and queue the pagination links it exposes.

    Pops one pagination query string from ``parse_urls``, records it in
    ``parsed``, downloads ``base + <query string>``, appends every item link
    found on that page to ``urls`` and pushes any pagination link that has
    neither been crawled nor queued yet back onto ``parse_urls``.

    Args:
        fun: Unused. Kept (and made optional) only so that existing
            ``parser(callback)`` calls keep working.

    Raises:
        IndexError: if ``parse_urls`` is empty.
    """
    element = parse_urls.pop()
    parsed.append(element)

    page = urllib.urlopen(base + element)
    try:
        soup = BeautifulSoup(page.read())
    finally:
        # Release the connection even when reading or parsing fails.
        page.close()

    # Any tag carrying the "right_block" class is treated as one listing
    # entry; the link inside its first <p> is the item URL.
    for topic in soup.findAll(True, 'right_block'):
        urls.append(topic.p.a["href"])

    page_list = soup.find(id="page_list")
    if page_list is None:
        # No pager on this page, so there is nothing further to queue.
        return

    for link in page_list.findAll('li'):
        href = link.a["href"]
        if href not in parse_urls and href not in parsed:
            parse_urls.append(href)


# Keep crawling until every discovered listing page has been visited.
while parse_urls:
    parser()
4. Parse
# Download every collected item URL, parse it and hand the soup to
# parse_item (defined in step 5, which must be loaded before this loop runs).
# ``pages`` collects whatever parse_item returns for each URL.
pages = []
for url in urls:
    # The href is encoded as cp1251 before percent-quoting
    # (presumably the site serves Windows-1251 URLs -- confirm).
    page = urllib.urlopen(site + urllib.quote(url.encode("cp1251")))
    try:
        soup = BeautifulSoup(page.read())
    finally:
        # Release the connection even when reading or parsing fails.
        page.close()
    # Progress output; the parenthesised form prints the same in Python 2.
    print(url)
    pages.append(parse_item(soup))
5. Populate the Django database through the ORM
def parse_item(soup):
    """Create and save an Article (and its Author) from a parsed item page.

    Reads the author name and the date from the ``<span>`` elements inside
    ``div#avtor``, the title, description and body from ``div#view``, and
    downloads the image referenced in ``div#galImg`` into ``Article.image``.

    Args:
        soup: BeautifulSoup document of a single item page.

    Returns:
        The saved ``Article`` instance.

    Raises:
        AttributeError: if an expected element is missing from the page.
        ValueError: if the author text is not exactly two space-separated
            words before the first comma.
        IndexError: if no ``d.m.y`` date is found in the second header span.
    """
    header = soup.find("div", id="avtor").findAll("span")
    view = soup.find("div", id="view")

    # First span: only the text before the first comma is used, and it is
    # expected to be "<name> <surname>".
    (name, surname) = header[0].string.strip().split(",")[0].split(" ")
    (author, _created) = Author.objects.get_or_create(name=name, surname=surname)

    a = Article()
    a.author = author
    a.title = view.h1.string

    # Second span carries the date written as day.month.year.
    (day, month, year) = re.findall(
        r'(\d+)\.(\d+)\.(\d+)', header[1].string.strip())[0]
    a.date = datetime(int(year), int(month), int(day))
    a.description = view.p.renderContents().strip()

    # The body is built from the plain <div>s inside div#view: no id, no
    # class, and a single text child.  Attributes are looked up with .get()
    # because ``"id" in tag`` tests the tag's children, not its attributes,
    # so the previous membership test never filtered anything out.
    paragraphs = []
    for item in view.findAll("div"):
        if item.get("id") is not None or item.get("class") is not None:
            continue
        if item.string is None:
            continue
        text = item.renderContents().strip()
        if text:
            paragraphs.append(text + "\n")
    a.body += "".join(paragraphs)

    # Random hex file name; the extension is taken from the source URL.
    digest = hashlib.md5(str(randint(0, 46364202620))).hexdigest()
    urlfile = soup.find("div", id="galImg").img["src"]
    extension = urlfile.split(".")[-1]

    response = urllib.urlopen(urlfile)
    try:
        content = ContentFile(response.read())
    finally:
        # Release the connection even when the download fails.
        response.close()

    # save=False stores the file without saving the model, so the row is
    # written once by the explicit save() below instead of twice.
    a.image.save(digest + "." + extension, content, save=False)
    a.save()
    return a
6. Django Models
from django.db import models
from django.contrib.auth.models import User


class Author(models.Model):
    """A writer, stored as a separate first name and surname."""

    name = models.CharField(max_length=100)
    surname = models.CharField(max_length=100)

    def articles(self):
        """Return the number of Article rows that reference this author."""
        authored = Article.objects.filter(author=self)
        return authored.count()

    def __unicode__(self):
        """Return the display form: name and surname separated by a space."""
        parts = (self.name, self.surname)
        return "%s %s" % parts


class Article(models.Model):
    """A single article with its author, date, texts and an attached file."""

    title = models.CharField(max_length=100)
    author = models.ForeignKey(Author)
    date = models.DateField()
    description = models.TextField()
    body = models.TextField()
    # Uploaded files are stored under the "attachments" upload directory.
    image = models.FileField(upload_to="attachments")

    def __unicode__(self):
        """Return the article title as its display form."""
        return self.title


class UploaderProfile(models.Model):
    """Per-user record holding an integer upload counter."""

    uploads = models.IntegerField()
    user = models.OneToOneField(User)

    def __unicode__(self):
        """Return the username followed by the upload count."""
        details = (self.user.username, self.uploads)
        return "%s (%d files uploaded)" % details
Category: beautifulsoup, django, parsing, python
Comments