functions for scraping
This commit is contained in:
parent
7a9e8339d2
commit
3c5f059759
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
Community Currencies and DEX Multitudes
|
Community Currencies and DEX Multitudes
|
||||||
#########################################
|
#########################################
|
||||||
|
|
||||||
|
105
scrape.py
105
scrape.py
@ -1,21 +1,102 @@
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
# Blog post whose metadata and body text the scraping helpers below operate on.
index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"

# Fetched once at import time; every helper receives this parsed soup.
# NOTE(review): 'lxml' requires the third-party lxml parser to be installed —
# confirm it is in the project's requirements.
html_text = requests.get(index_page).text
soup = BeautifulSoup(html_text, 'lxml')
|
||||||
|
|
||||||
|
def findwriter(soup):
    """Print ':author: <name>' for every author byline span on the page."""
    for author_span in soup.find_all('span', class_='iYG_V user-name _4AzY3'):
        print(":author: " + author_span.text)
|
||||||
|
|
||||||
|
# findwriter(soup)
|
||||||
|
|
||||||
|
def findtime(soup):
    """Print ':date: <date>' for every post-date span on the page."""
    for date_span in soup.find_all('span', class_='post-metadata__date time-ago'):
        print(":date: " + date_span.text)
|
||||||
|
|
||||||
|
# findtime(soup)
|
||||||
|
|
||||||
|
def findtags(soup):
    """Print ':tags: t1,t2,...' built from the page's tag list items.

    Prints nothing when no tag <li> elements are found; the original
    raised IndexError (apptags[0] on an empty list) in that case.
    """
    listtags = soup.find_all('li', class_='_3uJTw')
    apptags = [item.text for item in listtags]
    if not apptags:
        return
    # ",".join handles the single-tag case identically to the original's
    # duplicated if/else branches, so both collapse to one print.
    print(":tags: " + ",".join(apptags))
|
||||||
|
|
||||||
|
# findtags(soup)
|
||||||
|
|
||||||
|
def findmodified(soup):
    """Print ':modified: <date>' from the page's 'Updated:' paragraph.

    Prints a diagnostic message instead when the paragraph is missing or
    not shaped as expected.
    """
    try:
        updated = soup.find('p', class_="_2aGvg _1AZWZ")
        out = ":modified: "
        # Iterating the <p> tag walks its children; each child is expected
        # to wrap the date in a <span> — TODO confirm against the live page.
        for update in updated:
            uptime = update.span
            modified = uptime.text
            modified = modified.replace('Updated:', '')
            strout = out + modified
            print(strout)
    # Narrowed from a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt. TypeError covers find() returning None (iterating
    # None); AttributeError covers children lacking the expected <span>.
    except (AttributeError, TypeError):
        print("no such class for modified date")
|
||||||
|
|
||||||
|
# findmodified(soup)
|
||||||
|
|
||||||
|
def findtitle(soup):
    """Return (':title: <text>', '<text>') from the blog-post title span."""
    title_span = soup.find('span', class_='blog-post-title-font blog-post-title-color')
    raw_title = title_span.text
    return ':title: ' + raw_title, raw_title
|
||||||
|
|
||||||
|
# Module-level: pull the ':title: ...' line and the bare title text from the
# fetched page, for use by findslug below.
tagtitle, text = findtitle(soup)
|
||||||
|
|
||||||
|
def findslug(title):
    """Print a lowercase slug built from the first two words of *title*.

    The punctuation characters , ' : ( ) are stripped first. Titles with
    fewer than two words no longer raise IndexError (the original indexed
    words[1] unconditionally): whatever words exist are joined instead.
    """
    # One C-level pass instead of five chained .replace() calls.
    cleaned = title.translate(str.maketrans('', '', ",':()"))
    words = cleaned.split()
    slug = "-".join(words[:2]).lower()
    print(slug)
|
||||||
|
# findslug(text)
|
||||||
|
|
||||||
|
def filtercontent(soup):
    """Pretty-print every <p> inside the page's content-wrapper div."""
    maincontent = soup.find('div', id="content-wrapper")
    if maincontent is None:
        # find() returns None when the wrapper is absent; the original
        # crashed with AttributeError on maincontent.find_all here.
        print("no div with id 'content-wrapper'")
        return
    for par in maincontent.find_all('p'):
        print(par.prettify())
    # print(maincontent.prettify())
|
||||||
|
|
||||||
|
# Module-level side effect: dump every <p> of the fetched page on import/run.
filtercontent(soup)
|
||||||
# print(soup.find_all(id=True))
|
# print(soup.find_all(id=True))
|
||||||
# for tag in soup.find_all(True):
|
# for tag in soup.find_all(True):
|
||||||
# print(tag.name)
|
# print(tag.name)
|
||||||
def head_of_articles(soup):
|
# def head_of_articles(soup):
|
||||||
file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt",'a+')
|
# file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt",'a+')
|
||||||
for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
# for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
||||||
# print(match.p.text)
|
# # print(match.p.text)
|
||||||
for words in match.find_all('em'):
|
# for words in match.find_all('em'):
|
||||||
text = words.text
|
# text = words.text
|
||||||
file.write(text + "\n")
|
# file.write(text + "\n")
|
||||||
|
#
|
||||||
head_of_articles(soup)
|
# head_of_articles(soup)
|
||||||
# print(isinstance(head_of_articles(soup), list))
|
# print(isinstance(head_of_articles(soup), list))
|
||||||
|
|
||||||
# for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
# for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
||||||
|
Loading…
Reference in New Issue
Block a user