# pelican-website-ge/scrape.py

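"""Scrape blog posts from grassrootseconomics.org and convert them into
reStructuredText sources for a Pelican static site.

Selenium (Firefox, which needs geckodriver on PATH) renders the
JavaScript-driven blog index; requests and BeautifulSoup then fetch and
parse the individual posts.
"""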

import logging
import shutil
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# Load the blog index in Firefox; the post list is populated by JavaScript,
# so a plain requests.get() would miss most of the links.
browser = webdriver.Firefox()
browser.get("https://www.grassrootseconomics.org/blog")
browser.implicitly_wait(90)

# Scroll to the bottom repeatedly so the lazy-loaded post list fills in.
scrollnumber = 13
for i in range(1, scrollnumber):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
# Fetch one known post directly; handy for exercising the parser functions
# below during development.
gretext = "https://www.grassrootseconomics.org/post/gre-for-me"
html_text = requests.get(gretext).text
soup = BeautifulSoup(html_text, 'lxml')

# Parse the fully scrolled blog index rendered by Selenium.
mainsoup = BeautifulSoup(browser.page_source, 'lxml')
# Collect the post URLs. The class string is an obfuscated, framework-generated
# name copied from the live page, so it is brittle and may change.
bloglinks = []
atitles = mainsoup.find_all('a', class_="_2oveR _2llBS _1e-gz _3T8tF")
for a in atitles:
    bloglinks.append(a['href'])
print(len(bloglinks))
imgdir = "content/images/blog"  # where downloaded images are saved
storeimg = "images/blog"        # relative path used inside the generated .rst

logging.basicConfig(filename='loginginfo.log', encoding='utf-8', level=logging.INFO)
logging.info("Running scrape")
def findwriter(soup):
    """Return the post author as an ':author:' reST metadata line."""
    strauth = ":author: "
    authors = soup.find_all('span', class_='iYG_V user-name _4AzY3')
    for author in authors:
        strauth = ":author: " + author.text
    return strauth
def findtime(soup):
    """Return the publication date as a ':date:' reST metadata line."""
    strdate = ":date: "
    # loop variable renamed so it does not shadow the imported time module
    for posted in soup.find_all('span', class_='post-metadata__date time-ago'):
        strdate = ":date: " + posted.text
    return strdate
def findtags(soup):
    """Return post tags as a comma-separated ':tags:' line, or ' ' if none."""
    apptags = [li.text for li in soup.find_all('li', class_='_3uJTw')]
    if not apptags:
        return " "
    return ":tags: " + ",".join(apptags)
def findmodified(soup):
    """Return the 'Updated:' timestamp as a ':modified:' line, or ' ' if absent."""
    try:
        updated = soup.find('p', class_="_2aGvg _1AZWZ")
        modified = updated.span.text.replace('Updated:', '')
        return ":modified: " + modified
    except AttributeError:
        # posts that were never edited have no 'Updated:' paragraph
        return " "
def findtitle(soup):
    """Return (':title: ...' metadata line, plain title text)."""
    titletext = soup.find('span', class_='blog-post-title-font blog-post-title-color').text
    return ':title: ' + titletext, titletext
def findslug(title):
    """Build a short slug from the first two words of the title."""
    for ch in ",'():&":
        title = title.replace(ch, '')
    words = title.split()
    slug = (words[0] + "-" + words[1]).lower()
    return slug, ":slug: " + slug
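# e.g. findslug("GRE For Me") -> ("gre-for", ":slug: gre-for")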
# Embedded iframes (e.g. media widgets) are not converted to reST;
# filtercontent() below only logs them so the affected posts can be
# fixed up by hand.
def filtercontent(soup):
    """Convert one post page into a reStructuredText file under content/blog/."""
    maincontent = soup.find('div', class_="_6SyeS")
    title, words = findtitle(soup)
    author = findwriter(soup)
    date = findtime(soup)
    slug, slugtext = findslug(words)
    tags = findtags(soup)
    modified = findmodified(soup)
    blogpath = "content/blog/"
    filepath = blogpath + slug + '.rst'
    with open(filepath, 'a') as fileobj:
        # reST metadata block at the top of the post
        metadata = [title + "\n", author + "\n", date + "\n",
                    slugtext + "\n", modified + "\n", tags + "\n\n"]
        fileobj.writelines(metadata)
        print(filepath)
        for i, tag in enumerate(maincontent.find_all(True)):
            if tag.name == 'li':
                # bullet list item
                fileobj.write("\t* " + tag.text + "\n")
            if tag.name == 'a':
                # inline hyperlink: `text <href>`_
                fileobj.write("\t`" + tag.text + " <" + tag['href'] + ">`_\t")
            if tag.name == 'span':
                for child in tag.children:
                    if child.name is None:
                        # bare text node: emit as a plain paragraph
                        fileobj.write("\n\n" + tag.text + "\n\n")
                    if child.name == 'h2':
                        # section heading with a reST underline
                        fileobj.write("\n" + tag.text + "\n*******************************************************\n\n")
                    if child.name == 'strong':
                        fileobj.write("\t**" + tag.text.strip() + "**\t")
                    if child.name == 'em':
                        fileobj.write("\t*" + tag.text.strip() + "*\n")
            if tag.name == 'img':
                # the page stores the full-size image URL in data-pin-media;
                # download it and emit an image directive pointing at the copy
                _, plaintitle = findtitle(soup)
                imgslug, _ = findslug(plaintitle)
                imgsrc = tag.attrs['data-pin-media']
                r = requests.get(imgsrc, stream=True)
                if r.status_code == 200:
                    filename = "/" + imgslug + str(i + 1) + ".webp"
                    pathtofile = imgdir + filename
                    storedimg = storeimg + filename
                    with open(pathtofile, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
                    fileobj.write("\n\n.. image:: " + storedimg + "\n\n")
                else:
                    logging.info(f"cannot find image here {slug}")
            if tag.name == 'iframe':
                logging.info(f"iframe found here {slug}")
# Convert the first five collected posts; the slice presumably keeps test
# runs short.
for link in bloglinks[:5]:
    html_text = requests.get(link).text
    soup = BeautifulSoup(html_text, 'lxml')
    filtercontent(soup)

# Close the Selenium browser now that the rendered index is no longer needed.
browser.quit()