extract images in scrape
This commit is contained in:
parent
3c5f059759
commit
f1537afee7
BIN
content/images/blogtry0.jpg
Normal file
BIN
content/images/blogtry0.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 39 KiB |
BIN
content/images/blogtry1.jpg
Normal file
BIN
content/images/blogtry1.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 27 KiB |
BIN
content/images/try1.jpg
Normal file
BIN
content/images/try1.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 39 KiB |
BIN
content/images/try2.jpg
Normal file
BIN
content/images/try2.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 27 KiB |
47
scrape.py
47
scrape.py
@ -1,9 +1,12 @@
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests
|
import requests
|
||||||
|
import shutil
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"
|
index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"
|
||||||
html_text = requests.get(index_page).text
|
html_text = requests.get(index_page).text
|
||||||
soup = BeautifulSoup(html_text, 'lxml')
|
soup = BeautifulSoup(html_text, 'lxml')
|
||||||
|
imgdir = "content/images/blog"
|
||||||
|
|
||||||
def findwriter(soup):
|
def findwriter(soup):
|
||||||
authors = soup.find_all('span', class_='iYG_V user-name _4AzY3')
|
authors = soup.find_all('span', class_='iYG_V user-name _4AzY3')
|
||||||
@ -77,14 +80,44 @@ def findslug(title):
|
|||||||
print(slug)
|
print(slug)
|
||||||
# findslug(text)
|
# findslug(text)
|
||||||
|
|
||||||
def filtercontent(soup):
|
def finddownloadimg(soup):
    """Download the images wrapped in ``div`` elements of class ``_3lvoN LPH2h``.

    Every wrapper's first ``<img>`` contributes its ``src`` URL. Each URL whose
    request returns HTTP 200 is streamed to ``<imgdir>/try<N>.jpg``, where N
    starts at 1 and follows the order of the images in the page. Any other
    status prints "cannot find image" and the image is skipped.

    Args:
        soup: parsed BeautifulSoup document of the page to scrape.
    """
    # Local import: the file's visible import block does not bring in `os`.
    import os

    # Kept from the original; the returned values are not used here yet.
    newtitle, titletext = findtitle(soup)

    imgsinpage = []
    for wrap in soup.find_all('div', class_="_3lvoN LPH2h"):
        imgtag = wrap.img
        # A wrapper without an <img>, or an <img> without a src attribute,
        # has nothing to download; skip it instead of crashing.
        if imgtag is None or not imgtag.get('src'):
            continue
        imgsinpage.append(imgtag['src'])

    if imgsinpage:
        # open() does not create missing directories.
        os.makedirs(imgdir, exist_ok=True)

    for i, imgsrc in enumerate(imgsinpage, start=1):
        # The context manager releases the streamed connection when done.
        with requests.get(imgsrc, stream=True) as r:
            if r.status_code != 200:
                print("cannot find image")
                continue
            filename = "try" + str(i) + ".jpg"
            print(filename)
            # Build a filesystem path with os.path.join. urljoin() applies URL
            # resolution rules: a leading "/" discards imgdir entirely
            # (writing to the filesystem root), and a bare name replaces the
            # last segment of imgdir instead of being appended to it.
            with open(os.path.join(imgdir, filename), 'wb') as f:
                # Let urllib3 undo gzip/deflate so the raw bytes are the image.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
finddownloadimg(soup)
|
||||||
|
# def filtercontent(soup):
|
||||||
|
# maincontent = soup.find('div', id="content-wrapper")
|
||||||
|
# paragraphs = maincontent.find_all('p')
|
||||||
|
# for par in paragraphs:
|
||||||
|
# print(par.prettify())
|
||||||
|
# # print(maincontent.prettify())
|
||||||
|
#
|
||||||
|
# filtercontent(soup)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
filtercontent(soup)
|
|
||||||
# print(soup.find_all(id=True))
|
# print(soup.find_all(id=True))
|
||||||
# for tag in soup.find_all(True):
|
# for tag in soup.find_all(True):
|
||||||
# print(tag.name)
|
# print(tag.name)
|
||||||
|
Loading…
Reference in New Issue
Block a user