Andreas Demmelbauer 5 years ago
commit
7e0a95212c
6 changed files with 298 additions and 0 deletions
  1. +1 -0 .gitignore
  2. +61 -0 README.md
  3. +13 -0 config.example.json
  4. +217 -0 feedcake.py
  5. +2 -0 public/.gitignore
  6. +4 -0 requirements.txt

+ 1 - 0 .gitignore

@@ -0,0 +1 @@
config.json

+ 61 - 0 README.md

@@ -0,0 +1,61 @@
# Feedcake
## "Gib mir ein Stück Kuchen und ich will den ganzen cake."

### The Problem
Most news platforms don't give you the full article via RSS/Atom.
That alone wouldn't be a big problem, but some of them do crazy 1984-ish stuff on their
websites or put up paywalls for users with privacy add-ons.

### Goal of this script
Getting a full-featured news feed (full articles, including images) from various
news sites.

### Benefits for the user
* No need to visit the website to read the articles
* No ads
* No tracking

### Possible downsides for the user
* Articles don't get updated once they are scraped
* Articles arrive with some delay
* Interactive/special elements in articles may not work

### What it does
* Fetches the news feed from the original website
* Scrapes the content of new entries and saves it into a directory structure (see the sketch below)
* Saves a full-featured RSS file
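
The resulting files end up in a plain directory structure under `public/` (the names below are placeholders; the actual directory names are derived from the article URLs):

* `public/assets/<article-url-slug>/index.html`: the scraped article content
* `public/assets/<article-url-slug>/<image files>`: downloaded images, renamed with a hash of their URL
* `public/<destination>`: the rebuilt feed, e.g. `public/newspaper.xml`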

### ... and what it doesn't
* Managing when it scrapes (use crontab or something else for that)
* Serving the feeds and assets via HTTPS (use your favorite web server for that)
* Dealing with article comments
* Archiving feeds (content and assets are kept, but without metadata)
* Using some sort of database (the file structure is everything)
* Cleaning up old assets
* Automatically updating the basedir if it changed

### Ugly stuff?
* The HTML files (feed content) are stored alongside the assets, even though they
  don't need to be exposed via HTTPS.

### How to use
* git clone this project and enter the directory
* Install python3, pip and virtualenv
* Create a virtualenv: `virtualenv -p python3 ~/.virtualenvs/feedcake`
* Activate your new virtualenv: `source ~/.virtualenvs/feedcake/bin/activate`
* Switch into the project's directory: `cd feedcake`
* Install the requirements: `pip3 install -r requirements.txt`
* Copy the config example: `cp config.example.json config.json`
* Edit `config.json`
* Copy the cron example: `cp cron-example.sh cron.sh`
* Edit `cron.sh`
* Add a cronjob for `cron.sh`: `crontab -e`
  * `*/5 * * * * /absolute/path/to/cron.sh > /path/to/logfile 2>&1`
* Set up your web server: the `base_url` must point to the `public` directory.
  You should add basic HTTP authentication or at least keep the URL private.
* After running the script the first time (you can also run it manually, see the
  note below this list), your desired feed is available at
  `base_url/destination` (e.g. `https://yourdomain.tld/some-url/newspaper.xml`)
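
To run the script manually (e.g. to test your setup before adding the cronjob), activate the virtualenv and run `python3 feedcake.py`. By default it uses the `config.json` next to the script; you can also pass a path to an alternative config file as the first argument.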

### TODOs
* Decide what should happen with old news articles and assets which are not
listed in the current feed anymore.

+ 13 - 0 config.example.json

@@ -0,0 +1,13 @@
{
  "base_url" : "https://yourdomain.tld/some-url",
  "feeds" : [
    {
      "source" : "https://a.newspaper.tld/news.xml",
      "destination": "newspaper.xml"
    },
    {
      "source" : "https://another.newspaper.tld/rss",
      "destination": "another-newspaper.xml"
    }
  ]
}

+ 217 - 0 feedcake.py

@@ -0,0 +1,217 @@
# This script is intended for personal and scientific use only

import os
import sys
import re
import hashlib
import json
from time import sleep
import feedparser
import requests
from bs4 import BeautifulSoup, Comment
from bs4.element import CData


# default config location is a 'config.json' next to the script;
# an alternative path can be passed as the first command line argument.
filedir = os.path.dirname(os.path.abspath(__file__))
if len(sys.argv) < 2:
    configpath = filedir + '/config.json'
    print("Using default config location: ", configpath)
else:
    configpath = sys.argv[1]

try:
    with open(configpath) as configfile:
        config = json.load(configfile)
except (OSError, json.JSONDecodeError):
    print("Problem reading config file: ", configpath)
    print("ERROR: Config file not found or invalid!")
    sys.exit(1)
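
# Usage (the config path argument is optional):
#   python3 feedcake.py                             -> uses the config.json next to this script
#   python3 feedcake.py /path/to/other-config.json  -> uses that config file instead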

print(filedir)
public_path = filedir + '/public'
assets_path = public_path + '/assets'

# e.g. https://example.com/some-string
base_url = config['base_url']




# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
requestheaders = {
    'user-'+'age'+'nt' :
        'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go'
        + 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go'
        + 'og'+ 'le'+'.com/'+'bo'+'t.html)'
}




# need filename-safe strings for storing images alongside HTML files
def get_valid_filename(s):
    # drop query string, trailing slash and the URL scheme
    # (str.strip() strips characters, not prefixes, so use a regex for the scheme)
    s = re.sub(r'^https?://', '', str(s).split('?')[0].strip().strip('/')).replace(' ', '-')
    return re.sub(r'(?u)[^-\w.]', '-', s)

# Get a unique and valid filename from URL (for images)
def filename_from_url(url):
    # remove get attributes and path
    new_filename = url.split('?')[0].split('/')[-1]
    # Split filename
    new_filename = new_filename.split('.')
    # insert a hash before suffix
    new_filename.insert(1, str(hashlib.md5(url.encode('utf-8')).hexdigest()))
    # convert back to string and extra validate
    new_filename = get_valid_filename('.'.join(new_filename))
    return new_filename
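
# Example (illustrative URL; the inserted hash is the md5 of the full URL):
#   filename_from_url('https://example.com/images/cat.jpg?w=800')
#   returns 'cat.<md5 hex digest>.jpg'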

# Download images and so on
def download_image(url, entry_dir, filename):
    # take care of protocol relative URLs ... let's just assume that https works.
    if url.startswith('//'):
        url = 'https:' + url
    response = requests.get(url, headers=requestheaders)
    if response.status_code == 200:
        with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
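
# Note: responses other than HTTP 200 are silently ignored, so a failed download
# simply leaves that image missing in the locally stored article.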

def process_feed(feed_url, output_filename):

    # Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders)

    # TODO: exception handling (what if the feed request returns 404 or similar?)

    # Store data of new articles
    for entry in feedparser.parse(r_feed.text).entries:
        print(entry.link)
        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/' + entry_dir
        if not os.path.exists(entry_path):
            r = requests.get(entry.link.split('?')[0], headers=requestheaders)

            online_soup = BeautifulSoup(r.text, 'html.parser')

            content_soup = BeautifulSoup('<article></article>', 'html.parser')

            # Remove all comments
            for element in online_soup(text=lambda text: isinstance(text, Comment)):
                element.extract()

            # domain and path specific rules
            # ... split strings for (very simple) ob+fu+sca+tion

            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                if entry.get('date'):
                    article_time = content_soup.new_tag('time', datetime=entry.date)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
                content_soup.article.append(article_headline)
                article_body = online_soup.find('div', attrs={'class': 'story-content'})
                content_soup.article.append(article_body)
                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link + ')'
                content_soup.article.append(article_link)

            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'):  # url starts with a number ... too lazy for regex :)
                print(entry)
                if entry.get('published'):
                    article_time = content_soup.new_tag('time', datetime=entry.published)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
                content_soup.article.append(article_headline)
                # images etc
                article_aside = online_soup.find('div', id="content-aside")
                content_soup.article.append(article_aside)
                article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
                content_soup.article.append(article_body)

                # modify original link -> mobile version and comment section
                link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
                article_comments_link = content_soup.new_tag('a', href=link_to_comments)
                article_comments_link['class'] = 'comments'
                article_comments_link.string = 'Kommentare'
                content_soup.article.append(article_comments_link)

                # source link
                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
                content_soup.article.append(article_link)


            # create directory for storing and serving html and images
            os.makedirs(entry_path)

            # download all article images and replace image sources
            for img in content_soup.findAll('img'):
                print(img)
                if img.get('data-src'):
                    old_url = img['data-src']
                    print(old_url)
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['data-src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('src'):
                    old_url = img['src']
                    print(old_url)
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('data-srcset'):
                    srcset = img['data-srcset'].split(', ')
                    new_srcset = []
                    for src in srcset:
                        old_url = src.split(' ')[0]
                        src_res = src.split(' ')[1]
                        print(old_url)
                        new_filename = filename_from_url(old_url)
                        download_image(old_url, entry_dir, new_filename)
                        new_url = base_url + '/' + entry_dir + '/' + new_filename
                        src = ' '.join([new_url, src_res])
                        new_srcset.append(src)
                    img['data-srcset'] = ', '.join(new_srcset)

            # TODO(?): HTML5 picture tag

            with open(entry_path + '/index.html', 'w') as f:
                f.write(str(content_soup.prettify()))

            # wait a bit between article requests
            sleep(1.3)


    # Create new feed

    # Maybe building a new feed from scratch using a template would be nicer but ...
    # let's just modify the original one!

    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')

    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
        with open(assets_path + '/' + entry_dir + '/index.html', 'r') as f_content:
            content_tag = feed_soup.new_tag('content:encoded')
            content_tag.string = CData(f_content.read())
            e.append(content_tag)

    with open(public_path + '/' + output_filename, 'w') as f:
        f.write(str(feed_soup))




# Let's actually fetch the stuff!

for feed in config['feeds']:
    process_feed(feed['source'], feed['destination'])

+ 2 - 0 public/.gitignore

@@ -0,0 +1,2 @@
*
!.gitignore

+ 4 - 0 requirements.txt

@@ -0,0 +1,4 @@
beautifulsoup4==4.7.1
feedparser==5.2.1
requests==2.20.0
lxml==4.3.3
