functions for scraping
This commit is contained in:
parent
7a9e8339d2
commit
3c5f059759
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
Community Currencies and DEX Multitudes
|
Community Currencies and DEX Multitudes
|
||||||
#########################################
|
#########################################
|
||||||
|
|
||||||
|
105
scrape.py
105
scrape.py
@ -1,21 +1,102 @@
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
# Blog post whose metadata and body text the scraping helpers below operate on.
index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"

# Fetched once at import time; every helper receives this parsed soup.
# NOTE(review): 'lxml' requires the third-party lxml parser to be installed —
# confirm it is in the project's requirements.
html_text = requests.get(index_page).text
soup = BeautifulSoup(html_text, 'lxml')
|
||||||
|
|
||||||
|
def findwriter(soup):
    """Print ':author: <name>' for every author byline span on the page."""
    for author_span in soup.find_all('span', class_='iYG_V user-name _4AzY3'):
        print(":author: " + author_span.text)
|
||||||
|
|
||||||
|
# findwriter(soup)
|
||||||
|
|
||||||
|
def findtime(soup):
    """Print ':date: <date>' for every post-date span on the page."""
    for date_span in soup.find_all('span', class_='post-metadata__date time-ago'):
        print(":date: " + date_span.text)
|
||||||
|
|
||||||
|
# findtime(soup)
|
||||||
|
|
||||||
|
def findtags(soup):
    """Print ':tags: t1,t2,...' built from the page's tag list items.

    Prints nothing when no tag <li> elements are found; the original
    raised IndexError (apptags[0] on an empty list) in that case.
    """
    listtags = soup.find_all('li', class_='_3uJTw')
    apptags = [item.text for item in listtags]
    if not apptags:
        return
    # ",".join handles the single-tag case identically to the original's
    # duplicated if/else branches, so both collapse to one print.
    print(":tags: " + ",".join(apptags))
|
||||||
|
|
||||||
|
# findtags(soup)
|
||||||
|
|
||||||
|
def findmodified(soup):
    """Print ':modified: <date>' from the page's 'Updated:' paragraph.

    Prints a diagnostic message instead when the paragraph is missing or
    not shaped as expected.
    """
    try:
        updated = soup.find('p', class_="_2aGvg _1AZWZ")
        out = ":modified: "
        # Iterating the <p> tag walks its children; each child is expected
        # to wrap the date in a <span> — TODO confirm against the live page.
        for update in updated:
            uptime = update.span
            modified = uptime.text
            modified = modified.replace('Updated:', '')
            strout = out + modified
            print(strout)
    # Narrowed from a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt. TypeError covers find() returning None (iterating
    # None); AttributeError covers children lacking the expected <span>.
    except (AttributeError, TypeError):
        print("no such class for modified date")
|
||||||
|
|
||||||
|
# findmodified(soup)
|
||||||
|
|
||||||
|
def findtitle(soup):
    """Return (':title: <text>', '<text>') from the blog-post title span."""
    title_span = soup.find('span', class_='blog-post-title-font blog-post-title-color')
    raw_title = title_span.text
    return ':title: ' + raw_title, raw_title
|
||||||
|
|
||||||
|
# Module-level: pull the ':title: ...' line and the bare title text from the
# fetched page, for use by findslug below.
tagtitle, text = findtitle(soup)
|
||||||
|
|
||||||
|
def findslug(title):
    """Print a lowercase slug built from the first two words of *title*.

    The punctuation characters , ' : ( ) are stripped first. Titles with
    fewer than two words no longer raise IndexError (the original indexed
    words[1] unconditionally): whatever words exist are joined instead.
    """
    # One C-level pass instead of five chained .replace() calls.
    cleaned = title.translate(str.maketrans('', '', ",':()"))
    words = cleaned.split()
    slug = "-".join(words[:2]).lower()
    print(slug)
|
||||||
|
# findslug(text)
|
||||||
|
|
||||||
|
def filtercontent(soup):
    """Pretty-print every <p> inside the page's content-wrapper div."""
    maincontent = soup.find('div', id="content-wrapper")
    if maincontent is None:
        # find() returns None when the wrapper is absent; the original
        # crashed with AttributeError on maincontent.find_all here.
        print("no div with id 'content-wrapper'")
        return
    for par in maincontent.find_all('p'):
        print(par.prettify())
    # print(maincontent.prettify())
|
||||||
|
|
||||||
|
# Module-level side effect: dump every <p> of the fetched page on import/run.
filtercontent(soup)
|
||||||
# print(soup.find_all(id=True))
|
# print(soup.find_all(id=True))
|
||||||
# for tag in soup.find_all(True):
|
# for tag in soup.find_all(True):
|
||||||
# print(tag.name)
|
# print(tag.name)
|
||||||
def head_of_articles(soup):
|
# def head_of_articles(soup):
|
||||||
file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt",'a+')
|
# file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt",'a+')
|
||||||
for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
# for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
||||||
# print(match.p.text)
|
# # print(match.p.text)
|
||||||
for words in match.find_all('em'):
|
# for words in match.find_all('em'):
|
||||||
text = words.text
|
# text = words.text
|
||||||
file.write(text + "\n")
|
# file.write(text + "\n")
|
||||||
|
#
|
||||||
head_of_articles(soup)
|
# head_of_articles(soup)
|
||||||
# print(isinstance(head_of_articles(soup), list))
|
# print(isinstance(head_of_articles(soup), list))
|
||||||
|
|
||||||
# for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
# for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
||||||
|
Loading…
Reference in New Issue
Block a user