summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 15
-rw-r--r-- data/doc/feed2imap/examples/feed2imaprc 26
-rw-r--r-- lib/feed2imap/cache.rb 57
-rw-r--r-- lib/feed2imap/config.rb 3
-rw-r--r-- lib/feed2imap/feed2imap.rb 4
-rw-r--r-- manpages/feed2imaprc.xml 3
6 files changed, 77 insertions, 31 deletions
diff --git a/ChangeLog b/ChangeLog
index 8209430..0879abc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,15 +1,20 @@
Feed2Imap 0.8 (XX/XX/2006)
============================
-* Fixed a small bug in the duplicate items handling which could have caused
- some items to be ignored if they had the same url but different content.
-* New always-new flag in the config file to consider all items as new (for
- feeds where items are wrongly marked as updated, e.g mediawiki feeds).
- See example configuration file for more information.
* Uses the http_proxy environment variable to determine the proxy server
if available. (fixes gna bug #5820, all credits go to Boyd Adamson
<boyd-adamson@usa.net>)
* Fixes flocking on Solaris (fixes gna bug #5819). Again, all credits go to
Boyd Adamson <boyd-adamson@usa.net>.
+* Rewrite of the "find updated and new items" code. It should work much better
+ now. Also, a debug-updated configuration variable was added to make it
+ easier to debug those issues.
+* New always-new flag in the config file to consider all items as new (for
+ feeds where items are wrongly marked as updated, e.g. mediawiki feeds).
+ See example configuration file for more information (fixes Debian bug
+ #366878).
+* When disconnecting from the IMAP server, don't display an exception in
+ non-verbose mode if the connection was reset by the peer (fixes Debian bug
+ #367282).
Feed2Imap 0.7 (17/02/2006)
============================
diff --git a/data/doc/feed2imap/examples/feed2imaprc b/data/doc/feed2imap/examples/feed2imaprc
index 35a9f6c..602e68b 100644
--- a/data/doc/feed2imap/examples/feed2imaprc
+++ b/data/doc/feed2imap/examples/feed2imaprc
@@ -1,13 +1,21 @@
-# name is the name of the feed (must be unique)
-# url is the HTTP[S] address where the feed has to be fetched
-# target is the IMAP URI where to put emails
-# min-frequency (in HOURS) is the minimum frequency with which this particular
-# feed will be fetched
+# Global options:
+# dumpdir: (for debugging purposes) directory where all fetched feeds will be
+# dumped.
+# debug-updated: (for debugging purposes) if true, display a lot of information
+# about the "updated-items" algorithm.
+#
+# Per-feed options:
+# name: name of the feed (must be unique)
+# url: HTTP[S] address where the feed has to be fetched
+# target: the IMAP URI where to put emails. Should start with imap:// for IMAP
+# and imaps:// for IMAPS.
+# min-frequency: (in HOURS) the minimum frequency with which this particular
+# feed will be fetched
# disable: if set to any value, the feed will be ignored
-# always-new: feed2imap tries to use a clever algorithm to determine whether an item
-# is new or has been updated. It doesn't work well with some web apps like
-# mediawiki. When this flag is enabled, all items which don't match exactly
-# a previously downloaded item are considered as new items.
+# always-new: feed2imap tries to use a clever algorithm to determine whether
+# an item is new or has been updated. It doesn't work well with some web apps
+# like MediaWiki. When this flag is enabled, every item that doesn't exactly
+# match a previously downloaded item is considered a new item.
#
# If your login contains an @ character, replace it with %40. Other reserved
# characters can be escaped in the same way (see "man ascii" for their codes)
diff --git a/lib/feed2imap/cache.rb b/lib/feed2imap/cache.rb
index 9b5861f..a101785 100644
--- a/lib/feed2imap/cache.rb
+++ b/lib/feed2imap/cache.rb
@@ -17,20 +17,28 @@ along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
=end
+# debug mode
+$updateddebug = false
+
# This class manages a cache of items
# (items which have already been seen)
require 'digest/md5'
class ItemCache
- def initialize
+ def initialize(debug = false)
@channels = {}
@@cacheidx = 0
+ $updateddebug = debug
self
end
# Returns the really new items amongst items
def get_new_items(id, items, always_new = false)
+ if $updateddebug
+ puts "======================================================="
+ puts "GET_NEW_ITEMS FOR #{id}... (#{Time::now})"
+ end
@channels[id] ||= CachedChannel::new
return @channels[id].get_new_items(items, always_new)
end
@@ -92,7 +100,8 @@ end
class CachedChannel
# Size of the cache for each feed
- CACHESIZE = 50
+ # 100 items should be enough for everybody, even quite busy feeds
+ CACHESIZE = 100
attr_accessor :lastcheck, :items
@@ -113,8 +122,6 @@ class CachedChannel
# @nbnewitems is set by get_new_items, and is used to limit the number
# of (old) items serialized.
- UPDATEDDEBUG = false
-
# Returns the really new items amongst items
def get_new_items(items, always_new = false)
# save number of new items
@@ -124,6 +131,10 @@ class CachedChannel
updateditems = []
@itemstemp = @items
items.each { |i| i.cacheditem ||= CachedItem::new(i) }
+ if $updateddebug
+ puts "-------Items downloaded before dups removal (#{items.length}) :----------"
+ items.each { |i| puts "#{i.cacheditem.to_s}" }
+ end
# remove dups
dups = true
while dups
@@ -131,7 +142,7 @@ class CachedChannel
for i in 0...items.length do
for j in i+1...items.length do
if items[i].cacheditem == items[j].cacheditem
- if UPDATEDDEBUG
+ if $updateddebug
puts "## Removed duplicate #{items[j].cacheditem.to_s}"
end
items.delete_at(j)
@@ -143,10 +154,10 @@ class CachedChannel
end
end
# debug : dump interesting info to stdout.
- if UPDATEDDEBUG
- puts "-------Items downloaded :----------"
+ if $updateddebug
+ puts "-------Items downloaded after dups removal (#{items.length}) :----------"
items.each { |i| puts "#{i.cacheditem.to_s}" }
- puts "-------Items already there :----------"
+ puts "-------Items already there (#{@items.length}) :----------"
@items.each { |i| puts "#{i.to_s}" }
puts "Items always considered as new: #{always_new.to_s}"
end
@@ -168,7 +179,7 @@ class CachedChannel
# Try to find an updated item
@items.each do |j|
# Do we need a better heuristic ?
- if i.link and i.link == j.link
+ if j.is_ancestor_of(i)
i.cacheditem.index = j.index
i.cacheditem.updated = true
updateditems.push(i)
@@ -187,7 +198,7 @@ class CachedChannel
# add i.cacheditem to @itemstemp
@itemstemp.unshift(i.cacheditem)
end
- if UPDATEDDEBUG
+ if $updateddebug
puts "-------New items :----------"
newitems.each { |i| puts "#{i.cacheditem.to_s}" }
puts "-------Updated items :----------"
@@ -200,6 +211,9 @@ class CachedChannel
# too old items must be dropped
n = @nbnewitems > CACHESIZE ? @nbnewitems : CACHESIZE
@items = @itemstemp[0..n]
+ if $updateddebug
+ puts "Committing: new items: #{@nbnewitems} / items kept: #{@items.length}"
+ end
@itemstemp = []
self
end
@@ -212,13 +226,15 @@ end
# This class is the only thing kept in the cache
class CachedItem
- attr_reader :title, :link, :hash
+ attr_reader :title, :link, :creator, :date, :hash
attr_accessor :index
attr_accessor :updated
def initialize(item)
@title = item.title
@link = item.link
+ @date = item.date
+ @creator = item.creator
if item.content.nil?
@hash = nil
else
@@ -227,14 +243,29 @@ class CachedItem
end
def ==(other)
- @title == other.title and @link == other.link and @hash == other.hash
+ if $updateddebug and @title =~ /e325/ and other.title =~ /e325/
+ puts "Comparing #{self.to_s} and #{other.to_s}:"
+ puts "Title: #{@title == other.title}"
+ puts "Link: #{@link == other.link}"
+ puts "Creator: #{@creator == other.creator}"
+ puts "Date: #{@date == other.date}"
+ puts "Hash: #{@hash == other.hash}"
+ end
+ @title == other.title and @link == other.link and
+ (@creator.nil? or other.creator.nil? or @creator == other.creator) and
+ (@date.nil? or other.date.nil? or @date == other.date) and @hash == other.hash
end
def create_index
@index = ItemCache.getindex
end
+ def is_ancestor_of(other)
+ (@link and other.link and @link == other.link) and
+ ((@creator and other.creator and @creator == other.creator) or (@creator.nil?))
+ end
+
def to_s
- "\"#{@title}\" #{@link} #{@hash}"
+ "\"#{@title}\" #{@creator}/#{@date} #{@link} #{@hash}"
end
end
diff --git a/lib/feed2imap/config.rb b/lib/feed2imap/config.rb
index 1e13a04..38a1faa 100644
--- a/lib/feed2imap/config.rb
+++ b/lib/feed2imap/config.rb
@@ -26,7 +26,7 @@ DEFCACHE = ENV['HOME'] + '/.feed2imap.cache'
# Feed2imap configuration
class F2IConfig
- attr_reader :imap_accounts, :cache, :feeds, :dumpdir
+ attr_reader :imap_accounts, :cache, :feeds, :dumpdir, :updateddebug
# Load the configuration from the IO stream
# TODO should do some sanity check on the data read.
@@ -36,6 +36,7 @@ class F2IConfig
@dumpdir = @conf['dumpdir'] || nil
@conf['feeds'] ||= []
@feeds = []
+ @updateddebug = (@conf['debug-updated'] and @conf['debug-updated'] != 'false')
@imap_accounts = ImapAccounts::new
@conf['feeds'].each do |f|
if f['disable'].nil?
diff --git a/lib/feed2imap/feed2imap.rb b/lib/feed2imap/feed2imap.rb
index a40dd66..fcbb357 100644
--- a/lib/feed2imap/feed2imap.rb
+++ b/lib/feed2imap/feed2imap.rb
@@ -61,7 +61,7 @@ class Feed2Imap
end
# init cache
@logger.info('Initializing cache')
- @cache = ItemCache::new
+ @cache = ItemCache::new(@config.updateddebug)
if not File::exist?(@config.cache + '.lock')
f = File::new(@config.cache + '.lock', 'w')
f.close
@@ -187,7 +187,7 @@ class Feed2Imap
begin
ac.disconnect
rescue
- @logger.fatal("Exception caught while closing connection to #{ac.to_s}: #{$!}")
+ @logger.info("Exception caught while closing connection to #{ac.to_s}: #{$!}")
end
end
end
diff --git a/manpages/feed2imaprc.xml b/manpages/feed2imaprc.xml
index 6731f6e..7643d3d 100644
--- a/manpages/feed2imaprc.xml
+++ b/manpages/feed2imaprc.xml
@@ -43,7 +43,8 @@
</refsect1>
<refsect1>
<title>BUGS</title>
- <para>This manpage should probably give more details.</para>
+ <para>This manpage should probably give more details. However, the example configuration file is
+very well documented.</para>
</refsect1>
<refsect1>
<title>SEE ALSO</title>