extract images in scrape

This commit is contained in:
Idaapayo 2021-07-22 17:59:43 +03:00
parent 3c5f059759
commit f1537afee7
5 changed files with 40 additions and 7 deletions

BIN
content/images/blogtry0.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

BIN
content/images/blogtry1.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

BIN
content/images/try1.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

BIN
content/images/try2.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

View File

@ -1,9 +1,12 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import requests import requests
import shutil
import urllib.parse
index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies" index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"
html_text = requests.get(index_page).text html_text = requests.get(index_page).text
soup = BeautifulSoup(html_text, 'lxml') soup = BeautifulSoup(html_text, 'lxml')
imgdir = "content/images/blog"
def findwriter(soup): def findwriter(soup):
authors = soup.find_all('span', class_='iYG_V user-name _4AzY3') authors = soup.find_all('span', class_='iYG_V user-name _4AzY3')
@ -77,14 +80,44 @@ def findslug(title):
print(slug) print(slug)
# findslug(text) # findslug(text)
def finddownloadimg(soup):
    """Download the images embedded in the post into the ``imgdir`` directory.

    Every ``<div class="_3lvoN LPH2h">`` in *soup* is inspected; the ``src``
    of the ``<img>`` inside it is fetched and the response body is written to
    ``try<N>.jpg`` (N counts from 1 in page order) inside the module-level
    ``imgdir`` directory, which is created if it does not exist yet.

    Args:
        soup: Parsed BeautifulSoup document of the blog post.

    Returns:
        None. Progress is reported with ``print``; a non-200 response prints
        "cannot find image" and that image is skipped.
    """
    # Local import: the file-level imports do not include ``os`` and only this
    # function needs filesystem path handling.
    import os

    # Kept from the original; the values are not used in this function yet.
    # NOTE(review): findtitle is defined elsewhere in this file -- confirm
    # whether this call is still needed here.
    newtitle, titletext = findtitle(soup)

    imgsinpage = []
    for wrap in soup.find_all('div', class_="_3lvoN LPH2h"):
        imgtag = wrap.img
        # A wrapper without an <img>, or an <img> without a src attribute,
        # has nothing to download; skip it instead of raising.
        if imgtag is None or not imgtag.get('src'):
            continue
        imgsinpage.append(imgtag['src'])

    # open() does not create missing directories.
    os.makedirs(imgdir, exist_ok=True)

    for i, imgsrc in enumerate(imgsinpage, start=1):
        # The context manager releases the streamed connection even when the
        # status check fails or the write raises.
        with requests.get(imgsrc, stream=True) as r:
            if r.status_code != 200:
                print("cannot find image")
                continue
            # Build a filesystem path, not a URL: urljoin() resolves a
            # reference starting with "/" against the root and drops imgdir,
            # so the file would have been written to "/try<N>.jpg".
            filepath = os.path.join(imgdir, "try" + str(i) + ".jpg")
            print(filepath)
            # Let urllib3 undo gzip/deflate transfer encoding while copying.
            r.raw.decode_content = True
            with open(filepath, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
finddownloadimg(soup)
# def filtercontent(soup):
# maincontent = soup.find('div', id="content-wrapper")
# paragraphs = maincontent.find_all('p')
# for par in paragraphs:
# print(par.prettify())
# # print(maincontent.prettify())
#
# filtercontent(soup)
filtercontent(soup)
# print(soup.find_all(id=True)) # print(soup.find_all(id=True))
# for tag in soup.find_all(True): # for tag in soup.find_all(True):
# print(tag.name) # print(tag.name)