log timestamps, editable request-headers, README

6 years ago · 061e57323c
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ news pages
 * edit `cron.sh`
 * make `cron.sh` executable: `chmod +x cron.sh`
 * add cronjob for `cron.sh`: `crontab -e`
  * `*/5 * * * * /absolute/path/to/cron.sh  > /path/to/logfile 2>&1`
  * `*/5 * * * * /absolute/path/to/cron.sh  >> /path/to/logfile 2>&1`
 * setup your webserver:
  * let your webserver somehow point to the `feeds` directory.
    You should protect the HTTP path with basic authentication.
--- a/config.example.json
+++ b/config.example.json
@@ -1,5 +1,8 @@
 {
  "assets_url" : "https://yourdomain.tld/some-url",
  "request_headers": {
      "user-agent" : "Mozilla/5.0 (compatible; Feedcake/0.1 friendly, non-profit newsfeed bot)"
  },
  "feeds" : [
    {
      "source" : "https://a.newspaper.tld/news.xml",
--- a/feedcake.py
+++ b/feedcake.py
@@ -5,27 +5,31 @@ import sys
 import re
 import hashlib
 import json
 from time import sleep
 import time
 import feedparser
 import requests
 from bs4 import BeautifulSoup, Comment
 from bs4.element import CData


 def timestamp():
    """Return the current local time wrapped in square brackets.

    The layout is day/month-abbreviation/year:hour:minute:second followed by
    the UTC offset, e.g. ``[01/Jan/2020:12:00:00 +0100]``.
    """
    layout = "%d/%b/%Y:%H:%M:%S %z"
    now = time.localtime()
    return '[{}]'.format(time.strftime(layout, now))


 # default config location is a 'config.json' next to the script.
 # A different config file can be selected by passing its path as the first
 # command-line argument (sys.argv[1]).
 # NOTE(review): both the plain print() calls and their timestamp()-prefixed
 # twins are present below; presumably the plain ones are the pre-change lines
 # of a diff -- confirm which set is meant to stay.
 try:
    # directory that contains this script
    filedir = os.path.dirname(os.path.abspath(__file__))
    if len(sys.argv) < 2:
        # no path on the command line: use config.json beside the script
        configpath = filedir+'/config.json'
        print("Using default config location: ", configpath)
        print(timestamp(), "Using default config location: ", configpath)
        # NOTE(review): the file object returned by open() is never closed
        # explicitly here.
        config = json.load(open(configpath))
    else:
        # explicit config path given as first command-line argument
        configpath = sys.argv[1]
        config = json.load(open(configpath))

 except:
    # NOTE(review): a bare except also intercepts KeyboardInterrupt and
    # SystemExit, and configpath is unbound if the filedir line itself raised.
    # Any failure ends the program with exit status 1.
    print("Problem reading config file: ", configpath)
    print("ERROR: Config file not found or invalid!")
    print(timestamp(), "Problem reading config file: ", configpath)
    print(timestamp(), "ERROR: Config file not found or invalid!")
    sys.exit(1)

 public_path = filedir + '/public'
@@ -35,19 +39,7 @@ feeds_path = public_path + '/feeds'
 # e.g. https://example.com/some-string
 # Read from the 'assets_url' key of the loaded config.
 assets_url = config['assets_url']




 # "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
 # The split string pieces below concatenate to the header name 'user-agent'
 # and the value
 # 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'.
 requestheaders = {
    'user-'+'age'+'nt' :
    'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go'
    + 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go'
    + 'og'+ 'le'+'.com/'+'bo'+'t.html)'
 }



 # Request headers come from the 'request_headers' key of the config.
 # NOTE(review): as laid out here this assignment rebinds requestheaders, so
 # the literal dict above is superseded -- presumably the literal is the
 # removed side of a diff; confirm.
 requestheaders = config['request_headers']

 # need filename-safe strings for storing images alongside html files
 def get_valid_filename(s):
@@ -82,7 +74,7 @@ def process_feed(obj):
    feed_url = obj['source']
    output_filename = obj['destination']

    print('Updating:', obj['destination'])
    print(timestamp(), 'Updating:', obj['destination'])

    # Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders)
@@ -94,7 +86,7 @@ def process_feed(obj):
        entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/'+ entry_dir
        if not os.path.exists(entry_path):
            print('New item:', entry.link)
            print(timestamp(), 'New item:', entry.link)
            r = requests.get(entry.link.split('?')[0], headers=requestheaders)

            online_soup = BeautifulSoup(r.text, 'html.parser')
@@ -106,7 +98,7 @@ def process_feed(obj):
                element.extract()

            # domain and path specific rules
            # ... split strings for (very simple) ob+fu+sca+tion
            # ... ob+fu+sca+tion for seo

            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                if entry.date:
@@ -116,10 +108,10 @@ def process_feed(obj):
                content_soup.div.append(article_headline)
                article_body = online_soup.find('div', attrs={'class': 'story-content'})
                content_soup.div.append(article_body)
                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link + ')'
                content_soup.div.append(article_link)
                article_source = content_soup.new_tag('a', href=entry.link)
                article_source['class'] = 'source'
                article_source.string = 'Quelle: ' + entry.link
                content_soup.div.append(article_source)

            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
                if entry.published:
@@ -142,10 +134,12 @@ def process_feed(obj):
                article_comments_p.append(article_comments_link)
                content_soup.div.append(article_comments_p)

                article_link = content_soup.new_tag('a', href=entry.link.split('?')[0])
                article_link['class'] = 'source'
                article_link.string = 'Quelle: ' + entry.link.split('?')[0]
                content_soup.div.append(article_link)
                article_source = content_soup.new_tag('a', href=entry.link.split('?')[0])
                article_source['class'] = 'source'
                article_source.string = 'Quelle: ' + entry.link.split('?')[0]
                article_source_p = content_soup.new_tag('p')
                article_source_p.append(article_source)
                content_soup.div.append(article_source_p)


            # create directory for storing and serving html and images
@@ -185,7 +179,7 @@ def process_feed(obj):
            f.write(str(content_soup))
            f.close()

            sleep(1.3)
            time.sleep(1.3)


    # Create new feed
@@ -201,7 +195,7 @@ def process_feed(obj):
            matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
            if len(matches) > 0:
                e.extract()
                print('Exclude: ', e.title.text, '->', matches)
                print(timestamp(), 'Exclude: ', e.title.text, '->', matches)

    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
@@ -215,7 +209,7 @@ def process_feed(obj):
    os.makedirs(feeds_path, exist_ok=True)

    f = open(feeds_path + '/' + output_filename, 'w')
    print('Done!')
    print(timestamp(), 'Done!')
    f.write(str(feed_soup))
    f.close()