@@ -32,6 +32,7 @@ except:
     print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)

+# File location of feeds and assets are in directories next to the script
 public_path = filedir + '/public'
 assets_path = public_path + '/assets'
 feeds_path = public_path + '/feeds'
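
The three paths above give a layout like this next to the script (the location /opt/feedproxy is only an assumption for illustration):

    /opt/feedproxy/public/
    ├── assets/   directories with scraped article HTML and images, one per entry
    └── feeds/    the rewritten feed XML files
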
@@ -70,18 +71,19 @@ def download_image(url, entry_dir, filename):
         for chunk in response.iter_content(1024):
             f.write(chunk)

+# process the feed entries specified in the config
 def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']

     print(timestamp(), 'Updating:', obj['destination'])

-    # Get the feed
+    # Step 1: Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)

     # TODO: exception handling (what if the response is a 404 or another error?)

-    # Store data of new articles
+    # Step 2: Scrape and store data of new articles
     for entry in feedparser.parse(r_feed.text).entries:
         entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/' + entry_dir
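
The TODO in this hunk is still open. A minimal sketch of what that handling could look like; it reuses the script's requests import and header dict, while the wrapper function, the timeout value, and the None convention are assumptions:

import requests

def fetch_feed(feed_url, requestheaders):
    # Hypothetical wrapper around the Step 1 request, adding the missing error handling.
    try:
        r_feed = requests.get(feed_url, headers=requestheaders, timeout=30)  # timeout is an assumption
        r_feed.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
        return r_feed
    except requests.RequestException as err:
        print('ERROR: could not fetch feed:', feed_url, err)
        return None  # caller skips this feed and moves on to the next one
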
@@ -108,7 +110,7 @@ def process_feed(obj):
             content_soup.div.append(article_headline)
             article_body = online_soup.find('div', attrs={'class': 'story-content'})
             content_soup.div.append(article_body)

             # Add a link to the original article
             article_source = content_soup.new_tag('a', href=entry.link)
             article_source['class'] = 'source'
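
For readers unfamiliar with the BeautifulSoup calls in this hunk, a self-contained demo of the new_tag/append pattern it relies on; the bare wrapper markup and the link text are assumptions:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div></div>', 'html.parser')  # bare wrapper, like content_soup
link = soup.new_tag('a', href='https://orf.at/stories/3117136/')
link['class'] = 'source'
link.string = 'Source'  # link text is an assumption
p = soup.new_tag('p')
p.append(link)
soup.div.append(p)
print(soup)
# <div><p><a class="source" href="https://orf.at/stories/3117136/">Source</a></p></div>
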
@@ -145,7 +147,6 @@ def process_feed(obj):
             article_source_p.append(article_source)
             content_soup.div.append(article_source_p)

             # create a directory for storing and serving HTML and images
             os.makedirs(entry_path)
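
One caveat with this hunk: os.makedirs(entry_path) raises FileExistsError when the directory already exists, so the call presumably only runs for entries that have not been scraped yet. If that guard ever changes, a tolerant variant would be:

import os
os.makedirs(entry_path, exist_ok=True)  # no error if the directory already exists
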
@@ -178,15 +179,16 @@ def process_feed(obj):

             # TODO(?): HTML5 picture tag

+            # Save HTML
             f = open(entry_path + '/index.html', 'w')
             f.write(str(content_soup))
             f.close()

+            # Wait a bit
             time.sleep(1.3)

-    # Create new feed
+    # Step 3: Create 'new' feed

     # Maybe building a new feed from scratch using a template would be nicer but ...
     # let's just modify the original one!
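
A side note on the save step in this hunk: the open/write/close triple can be collapsed into a with block, which also closes the file if the write raises; an equivalent sketch (the explicit encoding is an addition):

with open(entry_path + '/index.html', 'w', encoding='utf-8') as f:
    f.write(str(content_soup))
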
@@ -201,6 +203,7 @@ def process_feed(obj):
             e.extract()
             print(timestamp(), 'Exclude: ', e.title.text, '->', matches)

+    # Add content
     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
         f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
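
For completeness, a self-contained sketch of how the exclusion step at the top of this hunk plausibly works; the 'xml' parser (which needs lxml installed), the exclude_patterns argument, and the regex matching are assumptions:

import re
from bs4 import BeautifulSoup

def filter_items(feed_xml, exclude_patterns):
    # Drop feed items whose title matches any exclude pattern.
    feed_soup = BeautifulSoup(feed_xml, 'xml')
    for e in feed_soup.findAll('item'):
        matches = [p for p in exclude_patterns if re.search(p, e.title.text)]
        if matches:
            e.extract()  # remove the <item> from the tree
            print('Exclude: ', e.title.text, '->', matches)
    return str(feed_soup)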