VirtLands wrote:Hmmm, this post hasn't been updated in a while. Could be it's turning into a cobweb site.
Yes, haven't got time to work on it for a while.
Here's a very important bit of the spider — post scraping. This 70-line piece of code scrapes post information and attachments, while also initiating the scraping of post content (I'll explain later why that's a separate process), user profiles, and (of course) the next page.
It's 2:30 AM here now, I need to go to sleep.
Code: Select all
def parsePosts(self, response):
    """
    Parse one page of a topic: extract each post's metadata and any
    attachments, and schedule follow-up requests for post content,
    unseen user profiles, and the next page of the topic.

    Yields:
        Post and Attachment items, plus Request objects for content,
        user-profile, and next-page scraping.
    """
    soup = BeautifulSoup(response.body)

    PROFILE_PREFIX = "profile.php?mode=viewprofile&u="
    POSTED_PREFIX = "Posted: "

    def is_post_anchor(tag):
        # Post anchors look like <a name="12345">; attribute values are
        # always strings in BS4, so test for a numeric string — an
        # isinstance(..., int) check would never match.
        return (tag.name == "a" and tag.has_attr("name")
                and tag["name"].isdigit())

    def is_profile_link(tag):
        # has_attr guards against anchors with no href (KeyError otherwise).
        return (tag.name == "a" and tag.has_attr("href")
                and tag["href"].startswith(PROFILE_PREFIX))

    def is_post_details(tag):
        # BS4 exposes "class" as a *list* of tokens, never a plain string,
        # so membership is the correct test. tag.string is None for tags
        # with child elements — guard before startswith.
        return (tag.name == "span"
                and "postdetails" in (tag.get("class") or [])
                and tag.string is not None
                and tag.string.startswith(POSTED_PREFIX))

    def is_post_body(tag):
        return (tag.name == "span"
                and "postbody" in (tag.get("class") or [])
                and not (tag.string or "").startswith(
                    "<br />_________________<br />"))

    def is_attachment_link(tag):
        return (tag.name == "a" and tag.has_attr("href")
                and tag["href"].startswith("download.php?id="))

    def content_fetch_mode(postid):
        # Prefer the edit form (raw BBCode of own posts), fall back to the
        # quote form; "raw" means only rendered HTML is available.
        if soup.find("a", href="posting.php?mode=editpost&p=" + postid):
            return "edit"
        if soup.find("a", href="posting.php?mode=quote&p=" + postid):
            return "quote"
        return "raw"

    # ResultSet has no .b attribute — extract the poster name per-tag.
    usernames = [span.b.string
                 for span in soup.find_all("span", attrs={"class": "name"})]

    for pid_tag, user_tag, username, time_tag, content in zip(
        soup.find_all(is_post_anchor),   # Post ID anchor
        soup.find_all(is_profile_link),  # Poster profile link
        usernames,                       # Poster display name
        soup.find_all(is_post_details),  # "Posted: <timestamp>" span
        soup.find_all(is_post_body),     # Post body span
    ):
        pid = pid_tag["name"]
        profile_href = user_tag["href"]

        aPost = Post()
        aPost["postID"] = pid
        aPost["topicID"] = response.meta["topicID"]
        # str.strip() removes a *set of characters*, not a prefix, and
        # would mangle the ID — slice the known prefix off instead.
        aPost["posterID"] = profile_href[len(PROFILE_PREFIX):]
        # Timestamps are always 10 digits, so keep exactly ten chars
        # ([0:9] would drop the last digit).
        aPost["postTime"] = time_tag.string[len(POSTED_PREFIX):][0:10]
        yield aPost

        # Attachment? (attachment table follows the post body, if any)
        attachTable = content.find_next_sibling(
            "table", attrs={"class": "attachtable"})
        if attachTable:
            anAttachment = Attachment()
            anAttachment["postID"] = pid
            # The original filename
            anAttachment["originalFilename"] = attachTable.find(
                "span", attrs={"class": "gen"})
            anAttachment["displayFilename"] = attachTable.find(
                is_attachment_link)
            # Previously built but never emitted — yield it so the
            # pipeline actually receives attachments.
            yield anAttachment

        # Initiate Post content scraping (separate request per post).
        mode = content_fetch_mode(pid)
        if mode != "raw":
            yield Request(
                (self.root_domain + "/" + "posting.php?mode="
                 + ("editpost" if mode == "edit" else "quote")
                 + "&p=" + pid),
                callback=self.parsePostContent,
                meta={"postID": pid, "content": None, "mode": mode})

        # Initiate User scraping for users we haven't seen yet.
        if username not in self.users_scanned:
            yield Request((self.root_domain + "/" + profile_href),
                          callback=self.parseUser)

    # Figure out if there's a next page of this topic.
    pager = soup.find("td", align="left", valign="bottom", colspan=2)
    if pager:
        next_link = pager.find("a", text="Next")
        if next_link:
            # Continue with parsePosts (not parseTopics): the next page
            # belongs to the same topic, as the forwarded topicID shows.
            yield Request((self.root_domain + "/" + next_link["href"]),
                          callback=self.parsePosts,
                          meta={"topicID": response.meta["topicID"]})