From 3c5f0597591cd1f34f871039da142db2320d95b4 Mon Sep 17 00:00:00 2001
From: Idaapayo
Date: Wed, 21 Jul 2021 17:04:17 +0300
Subject: [PATCH] functions for scrapping

---
Review notes (commentary placed after the three-dash line; it is not part of
the commit message):

* Layout: this copy of the patch had lost its line breaks. Indentation inside
  the hunks, the indentation of commented-out code, and the position of the
  two blank context lines in the scrape.py hunk were reconstructed from the
  code's syntax and the hunk line counts. Verify them against the real files
  before applying.
* findtags(): when the if/else runs after the loop (as laid out below),
  apptags[0] raises IndexError if no matching 'li' element was found.
  ",".join(apptags) already covers the single-tag case, so the else branch
  is not needed.
* findmodified(): the bare "except:" reports every failure as "no such class
  for modified date", including errors that have nothing to do with a
  missing element.
* findtitle(): title.text raises AttributeError when soup.find() returns None.
* findslug(): words[1] raises IndexError for a title with fewer than two
  words.
* requests.get() is called at module level and without a timeout argument.
* The parser changes from 'html.parser' to 'lxml'.

 content/blog/cic_will.rst |   1 -
 scrape.py                 | 105 +++++++++++++++++++++++++++++++++-----
 2 files changed, 93 insertions(+), 13 deletions(-)

diff --git a/content/blog/cic_will.rst b/content/blog/cic_will.rst
index 42ea3b5..58918f7 100644
--- a/content/blog/cic_will.rst
+++ b/content/blog/cic_will.rst
@@ -1,4 +1,3 @@
-
 Community Currencies and DEX Multitudes
 #########################################
 
diff --git a/scrape.py b/scrape.py
index eea6236..c8f77c8 100644
--- a/scrape.py
+++ b/scrape.py
@@ -1,21 +1,102 @@
 from bs4 import BeautifulSoup
 import requests
 
-research = "https://www.grassrootseconomics.org/research"
-html_text = requests.get(research).text
-soup = BeautifulSoup(html_text, 'html.parser')
+index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"
+html_text = requests.get(index_page).text
+soup = BeautifulSoup(html_text, 'lxml')
+
+def findwriter(soup):
+    authors = soup.find_all('span', class_='iYG_V user-name _4AzY3')
+    for author in authors:
+        tag = author.text
+        out = ":author: "
+        strauth = out + tag
+        print(strauth)
+
+# findwriter(soup)
+
+def findtime(soup):
+    times = soup.find_all('span', class_='post-metadata__date time-ago')
+    for time in times:
+        tag = time.text
+        out = ":date: "
+        strauth = out + tag
+        print(strauth)
+
+# findtime(soup)
+
+def findtags(soup):
+    listtags = soup.find_all('li', class_='_3uJTw')
+    out = ":tags: "
+    apptags = []
+    for lists in listtags:
+        tags = lists.text
+        apptags.append(tags)
+    if len(apptags) > 1:
+        newstr = ",".join(apptags)
+        strout = out + newstr
+        print(strout)
+    else:
+        newstr = apptags[0]
+        strout = out + newstr
+        print(strout)
+
+# findtags(soup)
+
+def findmodified(soup):
+    try:
+        updated = soup.find('p', class_="_2aGvg _1AZWZ")
+        out = ":modified: "
+        for update in updated:
+            uptime = update.span
+            modified = uptime.text
+            modified = modified.replace('Updated:', '')
+            strout = out + modified
+            print(strout)
+    except:
+        print("no such class for modified date")
+
+# findmodified(soup)
+
+def findtitle(soup):
+    title = soup.find('span', class_='blog-post-title-font blog-post-title-color')
+    out = ':title: '
+    titletext = title.text
+    newtitle = out + titletext
+    return newtitle, titletext
+
+tagtitle, text = findtitle(soup)
+
+def findslug(title):
+    words = title.replace(',','').replace("'",'').replace(":", '').replace("(",'').replace(")",'')
+    words = words.split()
+    first = words[0]
+    second = words[1]
+    slug = first + "-" + second
+    slug = slug.lower()
+    print(slug)
+# findslug(text)
+
+def filtercontent(soup):
+    maincontent = soup.find('div', id="content-wrapper")
+    paragraphs = maincontent.find_all('p')
+    for par in paragraphs:
+        print(par.prettify())
+    # print(maincontent.prettify())
+
+filtercontent(soup)
 # print(soup.find_all(id=True))
 # for tag in soup.find_all(True):
 #     print(tag.name)
-def head_of_articles(soup):
-    file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt",'a+')
-    for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
-        # print(match.p.text)
-        for words in match.find_all('em'):
-            text = words.text
-            file.write(text + "\n")
-
-head_of_articles(soup)
+# def head_of_articles(soup):
+#     file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt",'a+')
+#     for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
+#         # print(match.p.text)
+#         for words in match.find_all('em'):
+#             text = words.text
+#             file.write(text + "\n")
+#
+# head_of_articles(soup)
 # print(isinstance(head_of_articles(soup), list))
 
 # for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):