
log timestamps, editable request-headers, README

master
Andreas Demmelbauer 6 years ago
parent
commit
061e57323c
3 changed files with 29 additions and 32 deletions
  1. README.md (+1, -1)
  2. config.example.json (+3, -0)
  3. feedcake.py (+25, -31)

README.md (+1, -1)

@@ -51,7 +51,7 @@ news pages
 * edit `cron.sh`
 * make `cron.sh` executable: `chmod +x cron.sh`
 * add cronjob for `cron.sh`: `crontab -e`
-  * `*/5 * * * * /absolute/path/to/cron.sh > /path/to/logfile 2>&1`
+  * `*/5 * * * * /absolute/path/to/cron.sh >> /path/to/logfile 2>&1`
 * setup your webserver:
   * let your webserver somehow point to the `feeds` directory.
     You should protect the http path with a basic authentication.
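Note: the single README change only switches the cron redirection from > to >>, so each run of cron.sh appends to the logfile instead of truncating it; together with the newly timestamped log lines this keeps a running history across cron invocations.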


config.example.json (+3, -0)

@@ -1,5 +1,8 @@
 {
   "assets_url" : "https://yourdomain.tld/some-url",
+  "request_headers": {
+    "user-agent" : "Mozilla/5.0 (compatible; Feedcake/0.1 friendly, non-profit newsfeed bot)"
+  },
   "feeds" : [
     {
       "source" : "https://a.newspaper.tld/news.xml",


feedcake.py (+25, -31)

@@ -5,27 +5,31 @@ import sys
 import re
 import hashlib
 import json
-from time import sleep
+import time
 import feedparser
 import requests
 from bs4 import BeautifulSoup, Comment
 from bs4.element import CData
 
 
+
+def timestamp():
+    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'
+
 # default config location is a 'config.json' next to the script.
 try:
     filedir = os.path.dirname(os.path.abspath(__file__))
     if len(sys.argv) < 2:
         configpath = filedir+'/config.json'
-        print("Using default config location: ", configpath)
+        print(timestamp(), "Using default config location: ", configpath)
         config = json.load(open(configpath))
     else:
         configpath = sys.argv[1]
         config = json.load(open(configpath))
 
 except:
-    print("Problem reading config file: ", configpath)
-    print("ERROR: Config file not found or invalid!")
+    print(timestamp(), "Problem reading config file: ", configpath)
+    print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)
 
 public_path = filedir + '/public'
@@ -35,19 +39,7 @@ feeds_path = public_path + '/feeds'
 # e.g. https://example.com/some-string
 assets_url = config['assets_url']
 
-
-
-
-# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
-requestheaders = {
-    'user-'+'age'+'nt' :
-        'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go'
-        + 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go'
-        + 'og'+ 'le'+'.com/'+'bo'+'t.html)'
-}
-
-
-
+requestheaders = config['request_headers']
 
 # need filname safe strings for storing images along html files
 def get_valid_filename(s):
@@ -82,7 +74,7 @@ def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']
 
-    print('Updating:', obj['destination'])
+    print(timestamp(), 'Updating:', obj['destination'])
 
     # Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
@@ -94,7 +86,7 @@ def process_feed(obj):
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
         if not os.path.exists(entry_path):
-            print('New item:', entry.link)
+            print(timestamp(), 'New item:', entry.link)
             r = requests.get(entry.link.split('?')[0], headers=requestheaders)
 
             online_soup = BeautifulSoup(r.text, 'html.parser')
@@ -106,7 +98,7 @@ def process_feed(obj):
                 element.extract()
 
             # domain and path specific rules
-            # ... split strings for (very simple) ob+fu+sca+tion
+            # ... ob+fu+sca+tion for seo
 
             if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                 if entry.date:
@@ -116,10 +108,10 @@ def process_feed(obj):
                     content_soup.div.append(article_headline)
                     article_body = online_soup.find('div', attrs={'class': 'story-content'})
                     content_soup.div.append(article_body)
-                    article_link = content_soup.new_tag('a', href=entry.link)
-                    article_link['class'] = 'source'
-                    article_link.string = 'Quelle (' + entry.link + ')'
-                    content_soup.div.append(article_link)
+                    article_source = content_soup.new_tag('a', href=entry.link)
+                    article_source['class'] = 'source'
+                    article_source.string = 'Quelle: ' + entry.link
+                    content_soup.div.append(article_source)
 
             if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
                 if entry.published:
@@ -142,10 +134,12 @@ def process_feed(obj):
                     article_comments_p.append(article_comments_link)
                     content_soup.div.append(article_comments_p)
 
-                    article_link = content_soup.new_tag('a', href=entry.link.split('?')[0])
-                    article_link['class'] = 'source'
-                    article_link.string = 'Quelle: ' + entry.link.split('?')[0]
-                    content_soup.div.append(article_link)
+                    article_source = content_soup.new_tag('a', href=entry.link.split('?')[0])
+                    article_source['class'] = 'source'
+                    article_source.string = 'Quelle: ' + entry.link.split('?')[0]
+                    article_source_p = content_soup.new_tag('p')
+                    article_source_p.append(article_source)
+                    content_soup.div.append(article_source_p)
 
 
             # create directory for storing and serving html and images
@@ -185,7 +179,7 @@ def process_feed(obj):
                 f.write(str(content_soup))
                 f.close()
 
-            sleep(1.3)
+            time.sleep(1.3)
 
 
     # Create new feed
@@ -201,7 +195,7 @@ def process_feed(obj):
             matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
             if len(matches) > 0:
                 e.extract()
-                print('Exclude: ', e.title.text, '->', matches)
+                print(timestamp(), 'Exclude: ', e.title.text, '->', matches)
 
     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
@@ -215,7 +209,7 @@ def process_feed(obj):
     os.makedirs(feeds_path, exist_ok=True)
 
     f = open(feeds_path + '/' + output_filename, 'w')
-    print('Done!')
+    print(timestamp(), 'Done!')
     f.write(str(feed_soup))
     f.close()
 

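A quick standalone check of the new log prefix (the 'Updating' call below is only illustrative; the strftime format is the one introduced by this commit):

import time

def timestamp():
    # e.g. [18/Jun/2019:14:05:31 +0200] -- same timestamp style as common webserver access logs
    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'

print(timestamp(), 'Updating:', 'some-feed.xml')
# -> [18/Jun/2019:14:05:31 +0200] Updating: some-feed.xml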


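And a minimal sketch of the other behavioural change in the derStandard branch: the source link is now wrapped in its own paragraph before being appended to the content div (the URL below is a made-up placeholder):

from bs4 import BeautifulSoup

content_soup = BeautifulSoup('<div></div>', 'html.parser')
link = 'https://a.newspaper.tld/story/123'  # placeholder, not a real article

# same steps as the new code: build the <a class="source">, wrap it in a <p>, append it
article_source = content_soup.new_tag('a', href=link)
article_source['class'] = 'source'
article_source.string = 'Quelle: ' + link
article_source_p = content_soup.new_tag('p')
article_source_p.append(article_source)
content_soup.div.append(article_source_p)

print(content_soup)
# -> <div><p><a class="source" href="...">Quelle: ...</a></p></div>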
