more functions for the soup

This commit is contained in:
Idaapayo 2021-07-23 17:41:47 +03:00
parent f1537afee7
commit 76d7e0dc6f
4 changed files with 70 additions and 14 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

View File

@ -1,9 +1,9 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import requests import requests
import shutil import shutil
import urllib.parse
index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"
index_page = "https://www.grassrootseconomics.org/post/recycling-debt"
html_text = requests.get(index_page).text html_text = requests.get(index_page).text
soup = BeautifulSoup(html_text, 'lxml') soup = BeautifulSoup(html_text, 'lxml')
imgdir = "content/images/blog" imgdir = "content/images/blog"
@ -69,6 +69,7 @@ def findtitle(soup):
return newtitle, titletext return newtitle, titletext
tagtitle, text = findtitle(soup) tagtitle, text = findtitle(soup)
print(tagtitle)
def findslug(title): def findslug(title):
words = title.replace(',','').replace("'",'').replace(":", '').replace("(",'').replace(")",'') words = title.replace(',','').replace("'",'').replace(":", '').replace("(",'').replace(")",'')
@ -77,11 +78,13 @@ def findslug(title):
second = words[1] second = words[1]
slug = first + "-" + second slug = first + "-" + second
slug = slug.lower() slug = slug.lower()
print(slug) return slug
# findslug(text)
# print(findslug(text))
def finddownloadimg(soup): def finddownloadimg(soup):
newtitle, titletext = findtitle(soup) title, slugtitle = findtitle(soup)
titletext = findslug(slugtitle)
imgsinpage = [] imgsinpage = []
divwrap = soup.find_all('div', class_="_3lvoN LPH2h") divwrap = soup.find_all('div', class_="_3lvoN LPH2h")
for wrap in divwrap: for wrap in divwrap:
@ -92,26 +95,79 @@ def finddownloadimg(soup):
for i, imgsrc in enumerate(imgsinpage): for i, imgsrc in enumerate(imgsinpage):
r = requests.get(imgsrc, stream=True) r = requests.get(imgsrc, stream=True)
if r.status_code == 200: if r.status_code == 200:
filename = "/" + "try" + str(i+1) + ".jpg" filename = "/" + str(titletext) + str(i+1) + ".webp"
print(filename) pathtofile = imgdir + filename
with open(urllib.parse.urljoin(imgdir, filename), 'wb') as f: # print(pathtofile)
with open(pathtofile, 'wb') as f:
r.raw.decode_content = True r.raw.decode_content = True
shutil.copyfileobj(r.raw, f) shutil.copyfileobj(r.raw, f)
else: else:
print("cannot find image") print("cannot find image")
# finddownloadimg(soup)
def changehrefs(soup):
    """Print each anchor in the post body as a reStructuredText external link.

    Side effects only: writes one ``text <href>`` hyperlink line per anchor
    to stdout; returns None.
    """
    articlesection = soup.find('div', class_ = "post-content__body")
    alllinks = articlesection.find_all('a', class_="_2qJYG _2E8wo")
    for link in alllinks:
        linktext = link.text
        linkhref = link['href']
        # reST requires whitespace between the link text and the <URL> target;
        # without the space docutils will not recognise the hyperlink.
        newlinks = "`" + linktext + " <" + linkhref + ">" + "`" + "_"
        print(newlinks)
# changehrefs(soup)
def subtitles(soup):
    """Print each post subtitle followed by a reST section underline.

    Side effects only: writes the heading text and an ``*`` underline to
    stdout; returns None. The underline is sized to the heading because
    reStructuredText rejects an underline shorter than its title (the old
    fixed 49-character underline broke on longer subtitles).
    """
    bold = soup.find_all('h2', class_= "_3f-vr _208Ie blog-post-title-font _1Hxbl _3SkfC _2QAo- _25MYV _2WrB- _1atvN public-DraftStyleDefault-block-depth0 public-DraftStyleDefault-text-ltr")
    for b in bold:
        text = b.text
        # max(..., 1) keeps at least one '*' even for an empty heading.
        newtext = text + "\n" + "*" * max(len(text), 1)
        print(newtext)
# subtitles(soup)
def italics(soup):
    """Print every <em> run from the page wrapped in single asterisks
    (reStructuredText emphasis). Side effects only; returns None."""
    for emphasis in soup.find_all('em'):
        # strip() is equivalent to the chained lstrip().rstrip()
        cleaned = emphasis.text.strip()
        print("*" + cleaned + "*")
# italics(soup)
def bold(soup):
    """Print every <strong> run from the page wrapped in double asterisks
    (reStructuredText strong emphasis). Side effects only; returns None."""
    for strong_tag in soup.find_all('strong'):
        cleaned = strong_tag.text.strip()
        print("**" + cleaned + "**")
# bold(soup)
# def iframeproces(soup):
# iframe = soup.find('iframe', id="widget2")
# print(iframe)
# content = soup.find_all('div', class_= "post-content__body")
# for tag in soup.find_all(True):
# print(tag.name)
# wrap = content.find('div', class_="o56NN")
# for w in wrap.children:
# print(w)
# for w in wrap:
# iframe = w.decendants
# print(iframe)
# for frame in iframes:
# print(frame)
# iframeproces(soup)
finddownloadimg(soup)
# def filtercontent(soup): # def filtercontent(soup):
# maincontent = soup.find('div', id="content-wrapper") # maincontent = soup.find('div', id="content-wrapper")
# paragraphs = maincontent.find_all('p') #
# for par in paragraphs:
# print(par.prettify())
# # print(maincontent.prettify())
# #
# filtercontent(soup) # filtercontent(soup)