mf2: support alt text in <img> tags

snarfed/bridgy#756
snarfed committed on Jul 19, 2018 · 05a7818
1 parent 96ae47c
commit 05a7818
Show file tree

Hide file tree

Showing 9 changed files with 86 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -320,6 +320,9 @@ Changelog
   * Improve GraphQL support for comments and users.
 * Atom:
   * Shorten and ellipsize feed title when necessary ([#144](https://github.com/snarfed/granary/issues/144)).
+* microformats2:
+  * Upgrade mf2py to improve a few things like [implied p-name detection](http://microformats.org/wiki/microformats2-implied-properties) and whitespace handling ([#142](https://github.com/snarfed/granary/issues/142), fixes [#145](https://github.com/snarfed/granary/issues/145), [snarfed/bridgy#756](https://github.com/snarfed/bridgy/issues/756), [snarfed/bridgy#828](https://github.com/snarfed/bridgy/issues/828)).
+  * Support `alt` attribute in `<img>` tags ([snarfed/bridgy#756](https://github.com/snarfed/bridgy/issues/756)).
 
 ### 1.12 - 2018-03-24
 * Add Python 3 support! Granary now requires either Python 2.7+ or Python 3.3+.

diff --git a/app.py b/app.py
@@ -142,7 +142,7 @@ def get(self):
 
     mf2 = None
     if input == 'html':
-      mf2 = mf2py.parse(doc=body, url=url)
+      mf2 = mf2py.parse(doc=body, url=url, img_with_alt=True)
     elif input in ('mf2-json', 'json-mf2'):
       mf2 = body_json
       mf2.setdefault('rels', {})  # mf2util expects rels
@@ -154,7 +154,7 @@ def fetch_mf2_func(url):
         if util.domain_or_parent_in(urlparse.urlparse(url).netloc, SILO_DOMAINS):
           return {'items': [{'type': ['h-card'], 'properties': {'url': [url]}}]}
         _, doc = self._fetch(url)
-        return mf2py.parse(doc=doc, url=url)
+        return mf2py.parse(doc=doc, url=url, img_with_alt=True)
 
       try:
         actor = microformats2.find_author(mf2, fetch_mf2_func=fetch_mf2_func)

diff --git a/granary/atom.py b/granary/atom.py
@@ -329,9 +329,9 @@ def html_to_atom(html, url=None, fetch_author=False, reader=True):
   if fetch_author:
     assert url, 'fetch_author=True requires url!'
 
-  parsed = mf2py.parse(doc=html, url=url)
+  parsed = mf2py.parse(doc=html, url=url, img_with_alt=True)
   actor = microformats2.find_author(
-    parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))
+    parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url, img_with_alt=True))
 
   return activities_to_atom(
     microformats2.html_to_activities(html, url, actor),

diff --git a/granary/flickr.py b/granary/flickr.py
@@ -432,7 +432,7 @@ def user_to_actor(self, resp):
     if profile_url:
       try:
         resp = util.urlopen(profile_url)
-        profile_json = mf2py.parse(doc=resp, url=profile_url)
+        profile_json = mf2py.parse(doc=resp, url=profile_url, img_with_alt=True)
         # personal site is likely the first non-flickr url
         urls = profile_json.get('rels', {}).get('me', [])
         obj['urls'] = [{'value': u} for u in urls]

diff --git a/granary/microformats2.py b/granary/microformats2.py
@@ -24,6 +24,7 @@
   dedupe_urls,
   get_first,
   get_list,
+  get_url,
   get_urls,
   uniquify,
 )
@@ -224,8 +225,7 @@ def object_to_json(obj, trim_nulls=True, entry_class='h-entry',
       'summary': [summary],
       'url': (list(object_urls(obj) or object_urls(primary)) +
               obj.get('upstreamDuplicates', [])),
-      'photo': dedupe_urls(get_urls(attachments, 'image', 'image') +
-                           get_urls(primary, 'image')),
+      # photo is special cased below, to handle alt
       'video': dedupe_urls(get_urls(attachments, 'video', 'stream') +
                            get_urls(primary, 'stream')),
       'audio': get_urls(attachments, 'audio', 'stream'),
@@ -252,6 +252,16 @@ def object_to_json(obj, trim_nulls=True, entry_class='h-entry',
                  for a in attachments['note'] + attachments['article']]
   }
 
+  # photos, including alt text
+  photo_urls = set()
+  ret['properties']['photo'] = []
+  for image in get_list(attachments, 'image') + [primary]:
+    for url in get_urls(image, 'image'):
+      if url and url not in photo_urls:
+        photo_urls.add(url)
+        name = get_first(image, 'image', {}).get('displayName')
+        ret['properties']['photo'].append({'value': url, 'alt': name} if name else url)
+
   # hashtags and person tags
   if obj_type == 'tag':
     ret['properties']['tag-of'] = util.get_urls(obj, 'target')
@@ -345,7 +355,7 @@ def json_to_object(mf2, actor=None, fetch_mf2=False):
     # the author h-card may be on another page. run full authorship algorithm:
     # https://indieweb.org/authorship
     def fetch(url):
-      return mf2py.parse(util.requests_get(url).text, url=url)
+      return mf2py.parse(util.requests_get(url).text, url=url, img_with_alt=True)
     author = mf2util.find_author(
       {'items': [mf2]}, hentry=mf2, fetch_mf2_func=fetch if fetch_mf2 else None)
     if author:
@@ -385,10 +395,9 @@ def fetch(url):
     if re.match(r'^https?://github.com/[^/]+/[^/]+(/issues)?/?$', url):
       as_type = 'issue'
 
-  def absolute_urls(prop):
-    return [url for url in get_string_urls(props.get(prop, []))
-            # filter out relative and invalid URLs (mf2py gives absolute urls)
-            if urllib.parse.urlparse(url).netloc]
+  def is_absolute(url):
+    """Returns the URL's netloc, which is truthy only for absolute URLs. Used to skip relative and invalid URLs (mf2py gives absolute urls)."""
+    return urllib.parse.urlparse(url).netloc
 
   urls = props.get('url') and get_string_urls(props.get('url'))
 
@@ -416,9 +425,8 @@ def absolute_urls(prop):
     'content': get_html(prop.get('content')),
     'url': urls[0] if urls else None,
     'urls': [{'value': u} for u in urls] if urls and len(urls) > 1 else None,
-    'image': [{'url': url} for url in
-              dedupe_urls(absolute_urls('photo') + absolute_urls('featured'))],
-    'stream': [{'url': url} for url in absolute_urls('video')],
+    # image is special cased below, to handle alt
+    'stream': [{'url': url} for url in get_string_urls(props.get('video', []))],
     'location': json_to_object(prop.get('location')),
     'replies': {'items': [json_to_object(c) for c in props.get('comment', [])]},
     'tags': [{'objectType': 'hashtag', 'displayName': cat}
@@ -428,6 +436,20 @@ def absolute_urls(prop):
     'attachments': attachments,
   }
 
+  # images, including alt text
+  photo_urls = set()
+  obj['image'] = []
+  for photo in props.get('photo', []) + props.get('featured', []):
+    url = photo
+    alt = None
+    if isinstance(photo, dict):
+      photo = photo.get('properties') or photo
+      url = get_first(photo, 'value') or get_first(photo, 'url')
+      alt = get_first(photo, 'alt')
+    if url and url not in photo_urls and is_absolute(url):
+      photo_urls.add(url)
+      obj['image'].append({'url': url, 'displayName': alt})
+
   # mf2util uses the indieweb/mf2 location algorithm to collect location properties.
   interpreted = mf2util.interpret({'items': [mf2]}, None)
   if interpreted:
@@ -485,7 +507,7 @@ def html_to_activities(html, url=None, actor=None):
   Returns:
     list of ActivityStreams activity dicts
   """
-  parsed = mf2py.parse(doc=html, url=url)
+  parsed = mf2py.parse(doc=html, url=url, img_with_alt=True)
   hfeed = mf2util.find_first_entry(parsed, ['h-feed'])
   items = hfeed.get('children', []) if hfeed else parsed.get('items', [])
 
@@ -925,10 +947,13 @@ def find_author(parsed, **kwargs):
   """
   author = mf2util.find_author(parsed, 'http://123', **kwargs)
   if author:
+    photo = author.get('photo')
+    if isinstance(photo, dict):
+      photo = photo.get('url') or photo.get('value')
     return {
       'displayName': author.get('name'),
       'url': author.get('url'),
-      'image': {'url': author.get('photo')},
+      'image': {'url': photo},
     }
 
 
@@ -1028,12 +1053,16 @@ def img(src, alt=''):
   """Returns an <img> string with the given src, class, and alt.
 
   Args:
-    src: string, url of the image
+    src: string url or dict with value and (optionally) alt
     alt: string, alt attribute value, or None
 
   Returns:
     string
   """
+  if isinstance(src, dict):
+    assert not alt
+    alt = src.get('alt', '')
+    src = src.get('value')
   return '<img class="u-photo" src="%s" alt=%s />' % (
       src, xml.sax.saxutils.quoteattr(alt or ''))
 

diff --git a/granary/test/testdata/article_with_photo_with_alt.as.json b/granary/test/testdata/article_with_photo_with_alt.as.json
@@ -0,0 +1,9 @@
+{
+  "objectType": "article",
+  "displayName": "article abc",
+  "content": "foo bar",
+  "image": [{
+    "url": "http://pic/ture.jpg",
+    "displayName": "my alt text"
+  }]
+}
diff --git a/granary/test/testdata/article_with_photo_with_alt.mf2.html b/granary/test/testdata/article_with_photo_with_alt.mf2.html
@@ -0,0 +1,12 @@
+<article class="h-entry">
+  <span class="p-uid"></span>
+
+  <span class="p-name">article abc</span>
+  <div class="e-content">
+
+  foo bar
+  </div>
+
+  <img class="u-photo" src="http://pic/ture.jpg" alt="my alt text" />
+
+</article>
diff --git a/granary/test/testdata/article_with_photo_with_alt.mf2.json b/granary/test/testdata/article_with_photo_with_alt.mf2.json
@@ -0,0 +1,14 @@
+{
+  "type": ["h-entry"],
+  "properties": {
+    "name": ["article abc"],
+    "content": [{
+      "value": "foo bar",
+      "html": "foo bar"
+    }],
+    "photo": [{
+      "value": "http://pic/ture.jpg",
+      "alt": "my alt text"
+    }]
+  }
+}
diff --git a/granary/test/testdata/note_with_composite_photo.mf2.json b/granary/test/testdata/note_with_composite_photo.mf2.json
@@ -6,8 +6,7 @@
             "properties": {
                 "name": ["The Photo Caption"],
                 "url": ["https://ben.thatmustbe.me/static/img.jpg"]
-            },
-            "value": "https://ben.thatmustbe.me/static/img.jpgThe Photo Caption"
+            }
         }],
         "name": [""]
     }