functions for scraping

This commit is contained in:
Idaapayo 2021-07-21 17:04:17 +03:00
parent 7a9e8339d2
commit 3c5f059759
2 changed files with 93 additions and 13 deletions


@@ -1,4 +1,3 @@
Community Currencies and DEX Multitudes
#########################################

scrape.py

@@ -1,21 +1,102 @@
from bs4 import BeautifulSoup
import requests

research = "https://www.grassrootseconomics.org/research"
html_text = requests.get(research).text
soup = BeautifulSoup(html_text, 'html.parser')

# the post page below overwrites the research-page soup built above
index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"
html_text = requests.get(index_page).text
soup = BeautifulSoup(html_text, 'lxml')
def findwriter(soup):
    authors = soup.find_all('span', class_='iYG_V user-name _4AzY3')
    for author in authors:
        tag = author.text
        out = ":author: "
        strauth = out + tag
        print(strauth)
# findwriter(soup)
def findtime(soup):
    times = soup.find_all('span', class_='post-metadata__date time-ago')
    for time in times:
        tag = time.text
        out = ":date: "
        strauth = out + tag
        print(strauth)
# findtime(soup)
def findtags(soup):
    # collect all tag labels, then emit them as a single comma-separated field
    listtags = soup.find_all('li', class_='_3uJTw')
    out = ":tags: "
    apptags = []
    for lists in listtags:
        tags = lists.text
        apptags.append(tags)
    if apptags:
        # join() also covers the single-tag case and skips posts with no tags
        newstr = ",".join(apptags)
        strout = out + newstr
        print(strout)
# findtags(soup)
def findmodified(soup):
    try:
        updated = soup.find('p', class_="_2aGvg _1AZWZ")
        out = ":modified: "
        for update in updated:
            uptime = update.span
            modified = uptime.text
            modified = modified.replace('Updated:', '')
            strout = out + modified
            print(strout)
    except (TypeError, AttributeError):
        # find() returned None, or a child of the paragraph has no span
        print("no such class for modified date")
# findmodified(soup)
def findtitle(soup):
    title = soup.find('span', class_='blog-post-title-font blog-post-title-color')
    out = ':title: '
    titletext = title.text
    newtitle = out + titletext
    return newtitle, titletext


tagtitle, text = findtitle(soup)
def findslug(title):
    # slug is the first two words of the title, lowercased and stripped of punctuation
    words = title.replace(',', '').replace("'", '').replace(":", '').replace("(", '').replace(")", '')
    words = words.split()
    first = words[0]
    second = words[1]
    slug = first + "-" + second
    slug = slug.lower()
    print(slug)
# findslug(text)
def filtercontent(soup):
    maincontent = soup.find('div', id="content-wrapper")
    paragraphs = maincontent.find_all('p')
    for par in paragraphs:
        print(par.prettify())
    # print(maincontent.prettify())


filtercontent(soup)
# print(soup.find_all(id=True))
# for tag in soup.find_all(True):
#     print(tag.name)
def head_of_articles(soup):
    # append the emphasised intro text of each article block to a text file;
    # the with-block makes sure the file handle is closed after writing
    with open("ge-theme/static/scrapped-text/reseasrch/article-head.txt", 'a+') as file:
        for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
            # print(match.p.text)
            for words in match.find_all('em'):
                text = words.text
                file.write(text + "\n")


head_of_articles(soup)
# def head_of_articles(soup):
#     file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt", 'a+')
#     for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
#         # print(match.p.text)
#         for words in match.find_all('em'):
#             text = words.text
#             file.write(text + "\n")
#
# head_of_articles(soup)
# print(isinstance(head_of_articles(soup), list))
# for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):