Remove a and img tags from msg

While the notification spec allows tags like <a href="...">...</a> and
<img src="..." alt="...">, Pango cannot parse them, so they must be
removed before the text is passed to Pango.

Also, the function notification_extract_markup_urls is no longer needed,
as markup_strip_a can optionally return the URLs.

This implies that URL replacement is now indicated via show_indicators
for URLs, and that each entry of the dmenu string has the format
'[text between a tags] URL\n'. Images are handled in the same way.
This commit is contained in:
Benedikt Heine 2017-11-03 02:02:44 +01:00
parent 4bfae81f18
commit acd8be51ab
5 changed files with 295 additions and 48 deletions

View File

@ -4,6 +4,8 @@
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include "settings.h"
#include "utils.h"
@ -44,6 +46,178 @@ static char *markup_br2nl(char *str)
return str;
}
/*
 * Remove HTML hyperlinks (a tags) from a string, keeping the text between
 * the opening and the closing tag.
 *
 * Processing stops at the first malformed link:
 *  - an opening tag without any '>' is cut off together with the rest of
 *    the string
 *  - an opening tag whose '>' only shows up after the next "</a>" is removed
 *    up to and including that "</a>"
 * In both cases a warning is printed to stderr.
 *
 * @str: Pointer to the string to strip the a tags from. The result of every
 *       string_replace_at call is written back to *str, so callers must only
 *       use *str (and no older copy of the pointer) afterwards.
 * @urls: (nullable): Set to NULL if no href attribute was found, otherwise
 *        to a '\n' concatenated string of the URLs in format
 *        '[<text between tags>] <href>'
 */
void markup_strip_a(char **str, char **urls)
{
        char *tag1 = NULL;

        if (urls)
                *urls = NULL;

        while ((tag1 = strstr(*str, "<a"))) {
                // use href=" as stated in the notification spec
                char *href = strstr(tag1, "href=\"");
                char *tag1_end = strchr(tag1, '>');
                char *tag2 = strstr(tag1, "</a>");

                // the tag is broken, ignore it
                if (!tag1_end) {
                        fprintf(stderr,
                                "WARNING: Given link is broken: '%s'\n",
                                tag1);
                        // store the returned buffer, like the regular path below does
                        *str = string_replace_at(*str, tag1-*str, strlen(tag1), "");
                        break;
                }
                if (tag2 && tag2 < tag1_end) {
                        int repl_len = (tag2 - tag1) + strlen("</a>");
                        fprintf(stderr,
                                "WARNING: Given link is broken: '%.*s.'\n",
                                repl_len, tag1);
                        // store the returned buffer, like the regular path below does
                        *str = string_replace_at(*str, tag1-*str, repl_len, "");
                        break;
                }

                // search contents of href attribute,
                // it only counts if it lies inside of the opening tag
                char *plain_url = NULL;
                if (href && href < tag1_end) {
                        // shift href to the actual begin of the value
                        href = href + strlen("href=\"");
                        const char *quote = strchr(href, '"');
                        if (quote && quote < tag1_end) {
                                plain_url = g_strndup(href, quote-href);
                        }
                }

                // text between a tags; without a closing tag,
                // everything up to the end of the string counts as text
                int text_len;
                if (tag2)
                        text_len = tag2 - (tag1_end+1);
                else
                        text_len = strlen(tag1_end+1);

                char *text = g_strndup(tag1_end+1, text_len);

                // replace "<a ...>" + text (+ "</a>") by the bare text
                int repl_len = text_len + (tag1_end-tag1) + 1;
                repl_len += tag2 ? strlen("</a>") : 0;

                *str = string_replace_at(*str, tag1-*str, repl_len, text);

                // if there had been a href attribute,
                // add it to the URLs
                if (plain_url && urls) {
                        // the text gets wrapped in brackets,
                        // so drop any brackets it contains itself
                        text = string_replace_all("]", "", text);
                        text = string_replace_all("[", "", text);

                        char *url = g_strdup_printf("[%s] %s", text, plain_url);

                        *urls = string_append(*urls, url, "\n");
                        g_free(url);
                }

                g_free(plain_url);
                g_free(text);
        }
}
/*
 * Remove img tags from a string. Every tag is replaced by the value of its
 * alt attribute or, if no usable alt attribute is found, by "[image]".
 *
 * Processing stops at the first img tag without a closing '>': the tag is
 * cut off together with the rest of the string and a warning is printed to
 * stderr.
 *
 * @str: Pointer to the string to strip the img tags from. The result of
 *       every string_replace_at call is written back to *str, so callers
 *       must only use *str (and no older copy of the pointer) afterwards.
 * @urls: (nullable): Set to NULL if no src attribute was found, otherwise to
 *        a '\n' concatenated string of the URLs in format '[<alt>] <src>'
 */
void markup_strip_img(char **str, char **urls)
{
        const char *start = NULL;

        if (urls)
                *urls = NULL;

        while ((start = strstr(*str, "<img"))) {
                const char *end = strchr(start, '>');

                // the tag is broken, ignore it
                if (!end) {
                        fprintf(stderr, "WARNING: Given image is broken: '%s'\n", start);
                        // store the returned buffer, like the regular path below does
                        *str = string_replace_at(*str, start-*str, strlen(start), "");
                        break;
                }

                // use attribute=" as stated in the notification spec
                const char *alt_s = strstr(start, "alt=\"");
                const char *src_s = strstr(start, "src=\"");

                char *text_alt = NULL;
                char *text_src = NULL;

                // closing quotes of the attribute values
                const char *src_e = NULL, *alt_e = NULL;
                if (alt_s)
                        alt_e = strstr(alt_s + strlen("alt=\""), "\"");
                if (src_s)
                        src_e = strstr(src_s + strlen("src=\""), "\"");

                // Move pointer to the actual start
                alt_s = alt_s ? alt_s + strlen("alt=\"") : NULL;
                src_s = src_s ? src_s + strlen("src=\"") : NULL;

                /* check if alt and src attribute are given
                 * If both given, check the alignment of all pointers:
                 * the first value has to end before the second attribute
                 * begins and the second value has to end inside of the tag */
                if (   alt_s && alt_e
                    && src_s && src_e
                    && (  (alt_s < src_s && alt_e < src_s-strlen("src=\"") && src_e < end)
                        ||(src_s < alt_s && src_e < alt_s-strlen("alt=\"") && alt_e < end)) ) {

                        text_alt = g_strndup(alt_s, alt_e-alt_s);
                        text_src = g_strndup(src_s, src_e-src_s);

                /* check if single valid alt attribute is available */
                } else if (alt_s && alt_e && alt_e < end && (!src_s || src_s < alt_s || alt_e < src_s - strlen("src=\""))) {
                        text_alt = g_strndup(alt_s, alt_e-alt_s);

                /* check if single valid src attribute is available */
                } else if (src_s && src_e && src_e < end && (!alt_s || alt_s < src_s || src_e < alt_s - strlen("alt=\""))) {
                        text_src = g_strndup(src_s, src_e-src_s);

                } else {
                        fprintf(stderr,
                                "WARNING: Given image argument is broken: '%.*s'\n",
                                (int)(end-start), start);
                }

                // replacement text for alt
                int repl_len = end - start + 1;

                if (!text_alt)
                        text_alt = g_strdup("[image]");

                *str = string_replace_at(*str, start-*str, repl_len, text_alt);

                // if there had been a src attribute,
                // add it to the URLs
                if (text_src && urls) {
                        // the alt text gets wrapped in brackets,
                        // so drop any brackets it contains itself
                        text_alt = string_replace_all("]", "", text_alt);
                        text_alt = string_replace_all("[", "", text_alt);

                        char *url = g_strdup_printf("[%s] %s", text_alt, text_src);

                        *urls = string_append(*urls, url, "\n");
                        g_free(url);
                }

                g_free(text_src);
                g_free(text_alt);
        }
}
/*
* Strip any markup from text; turn it in to plain text.
*
@ -96,6 +270,8 @@ char *markup_transform(char *str, enum markup_mode markup_mode)
break;
case MARKUP_FULL:
str = markup_br2nl(str);
markup_strip_a(&str, NULL);
markup_strip_img(&str, NULL);
break;
}

View File

@ -5,6 +5,10 @@
#include "settings.h"
char *markup_strip(char *str);
void markup_strip_a(char **str, char **urls);
void markup_strip_img(char **str, char **urls);
char *markup_transform(char *str, enum markup_mode markup_mode);
#endif

View File

@ -252,46 +252,6 @@ void notification_replace_single_field(char **haystack,
g_free(input);
}
/*
 * Rewrite the `<a href...>` links found in *str_ptr and collect their URLs.
 *
 * For every opening tag the URL is looked up with extract_urls(). If one is
 * found, the opening tag is replaced by "[", "</a>" is replaced by " #N]"
 * (N counting up from 1) and "[#N] <url>" is added to the result. If none is
 * found, the opening tag and "</a>" are replaced by the empty string.
 *
 * The possibly changed string is written back to *str_ptr.
 *
 * Returns the '\n' separated URL list or NULL if no URL was collected.
 *
 * NOTE(review): how many occurrences string_replace() substitutes and who
 * owns the buffers it takes is not visible here; verify against its
 * definition.
 */
char *notification_extract_markup_urls(char **str_ptr)
{
char *start, *end, *replace_buf, *str, *urls = NULL, *url, *index_buf;
// running number used in the "[#N]" markers
int linkno = 1;
str = *str_ptr;
while ((start = strstr(str, "<a href")) != NULL) {
end = strstr(start, ">");
if (end != NULL) {
// copy of the complete opening tag, including the '>'
replace_buf = g_strndup(start, end - start + 1);
url = extract_urls(replace_buf);
if (url != NULL) {
str = string_replace(replace_buf, "[", str);
index_buf = g_strdup_printf("[#%d]", linkno++);
// append "[#N] <url>" as a new line of the list
if (urls == NULL) {
urls = g_strconcat(index_buf, " ", url, NULL);
} else {
char *tmp = urls;
urls = g_strconcat(tmp, "\n", index_buf, " ", url, NULL);
g_free(tmp);
}
// turn "[#N]" into " #N]", which takes the place of "</a>"
index_buf[0] = ' ';
str = string_replace("</a>", index_buf, str);
g_free(index_buf);
g_free(url);
} else {
// no URL in this tag: drop the tag without a marker
str = string_replace(replace_buf, "", str);
str = string_replace("</a>", "", str);
}
g_free(replace_buf);
} else {
// opening tag is never closed: stop scanning
break;
}
}
*str_ptr = str;
return urls;
}
/*
* Create notification struct and initialise all fields with either
* - the default (if it's not needed to be freed later)
@ -479,15 +439,26 @@ static void notification_format_message(notification *n)
// Fill n->urls with the '\n' separated URLs found in the summary and body
// of the notification.
// NOTE(review): this span looks like a rendered diff hunk whose removed and
// added lines are interleaved without +/- markers; confirm which statements
// belong to the current version before relying on it.
static void notification_extract_urls(notification *n)
{
// DO markup urls processing here until we split this out correctly
n->urls = notification_extract_markup_urls(&(n->body));
g_clear_pointer(&n->urls, g_free);
char *tmp = g_strconcat(n->summary, " ", n->body, NULL);
// working copy "<summary> <body>" which the strip functions may modify
char *urls_in = string_append(g_strdup(n->summary), n->body, " ");
char *tmp_urls = extract_urls(tmp);
n->urls = string_append(n->urls, tmp_urls, "\n");
g_free(tmp_urls);
g_free(tmp);
char *urls_a = NULL;
char *urls_img = NULL;
markup_strip_a(&urls_in, &urls_a);
markup_strip_img(&urls_in, &urls_img);
// remove links and images first to not confuse
// plain urls extraction
char *urls_text = extract_urls(urls_in);
// resulting order: URLs of a tags, URLs of img tags, plain text URLs
n->urls = string_append(n->urls, urls_a, "\n");
n->urls = string_append(n->urls, urls_img, "\n");
n->urls = string_append(n->urls, urls_text, "\n");
g_free(urls_in);
g_free(urls_a);
g_free(urls_img);
g_free(urls_text);
}
static void notification_dmenu_string(notification *n)

View File

@ -72,7 +72,7 @@ typedef struct _notification {
/* derived fields */
char *msg; /* formatted message */
char *text_to_render; /* formatted message (with age and action indicators) */
char *urls; /* urllist */
char *urls; /* urllist delimited by '\n' */
} notification;
notification *notification_create(void);

View File

@ -45,12 +45,108 @@ TEST test_markup_transform(void)
ASSERT_STR_EQ("<i>foo</i> bar baz", (ptr=markup_transform(g_strdup("<i>foo</i><br>bar\nbaz"), MARKUP_FULL)));
g_free(ptr);
// Test replacement of img and a tags, not renderable by pango
ASSERT_STR_EQ("foo bar bar baz", (ptr=markup_transform(g_strdup("<img alt=\"foo bar\"><br>bar\nbaz"), MARKUP_FULL)));
g_free(ptr);
ASSERT_STR_EQ("test ", (ptr=markup_transform(g_strdup("test <img alt=\"foo bar\""), MARKUP_FULL)));
g_free(ptr);
ASSERT_STR_EQ("test [image] image", (ptr=markup_transform(g_strdup("test <img src=\"nothing.jpg\"> image"), MARKUP_FULL)));
g_free(ptr);
ASSERT_STR_EQ("bar baz", (ptr=markup_transform(g_strdup("<a href=\"asdf\">bar</a> baz"), MARKUP_FULL)));
g_free(ptr);
PASS();
}
/*
 * Parameterised check for markup_strip_a.
 *
 * @in: markup handed to markup_strip_a
 * @exp: expected string after stripping
 * @urls: expected URL list, or NULL if no URL list is expected
 */
TEST helper_markup_strip_a (const char *in, const char *exp, const char *urls)
{
        // label shown by the assertions, names the input of the failing case
        char *label = g_strconcat("url: ", in, NULL);
        char *stripped = g_strdup(in);
        // pure out-parameter: seeded with a bogus address,
        // which the tested function has to overwrite
        char *found_urls = (char *)0x04; //Chosen by a fair dice roll

        markup_strip_a(&stripped, &found_urls);

        ASSERT_STR_EQm(label, exp, stripped);

        if (!urls) {
                ASSERT_EQm(label, urls, found_urls);
        } else {
                ASSERT_STR_EQm(label, urls, found_urls);
        }

        g_free(label);
        g_free(stripped);
        g_free(found_urls);

        PASS();
}
/*
 * Test cases for markup_strip_a.
 * Arguments after the helper: input, expected output, expected URL list.
 */
TEST test_markup_strip_a(void)
{
// well-formed link with an URL
RUN_TESTp(helper_markup_strip_a, "<a href=\"https://url.com\">valid</a> link", "valid link", "[valid] https://url.com");
// an empty href still yields an entry in the URL list
RUN_TESTp(helper_markup_strip_a, "<a href=\"\">valid</a> link", "valid link", "[valid] ");
// no href attribute: tag is stripped, no URL list
RUN_TESTp(helper_markup_strip_a, "<a>valid</a> link", "valid link", NULL);
// missing closing tag: the rest of the string is used as link text
RUN_TESTp(helper_markup_strip_a, "<a href=\"https://url.com\">valid link", "valid link", "[valid link] https://url.com");
// opening tag is never closed before "</a>": link is removed, no URL list
RUN_TESTp(helper_markup_strip_a, "<a href=\"https://url.com\" invalid</a> link", " link", NULL);
RUN_TESTp(helper_markup_strip_a, "<a invalid</a> link", " link", NULL);
PASS();
}
/*
 * Parameterised check for markup_strip_img.
 *
 * @in: markup handed to markup_strip_img
 * @exp: expected string after stripping
 * @urls: expected URL list, or NULL if no URL list is expected
 */
TEST helper_markup_strip_img (const char *in, const char *exp, const char *urls)
{
        // label shown by the assertions, names the input of the failing case
        char *label = g_strconcat("url: ", in, NULL);
        char *stripped = g_strdup(in);
        // pure out-parameter: seeded with a bogus address,
        // which the tested function has to overwrite
        char *found_urls = (char *)0x04; //Chosen by a fair dice roll

        markup_strip_img(&stripped, &found_urls);

        ASSERT_STR_EQm(label, exp, stripped);

        if (!urls) {
                ASSERT_EQm(label, urls, found_urls);
        } else {
                ASSERT_STR_EQm(label, urls, found_urls);
        }

        g_free(label);
        g_free(stripped);
        g_free(found_urls);

        PASS();
}
/*
 * Test cases for markup_strip_img.
 * Arguments after the helper: input, expected output, expected URL list.
 * Inputs starting with "v" carry intact attributes, inputs starting with
 * "i" have at least one unterminated attribute value or an unclosed tag.
 */
TEST test_markup_strip_img(void)
{
// no attributes at all: placeholder text, no URL list
RUN_TESTp(helper_markup_strip_img, "v <img> img", "v [image] img", NULL);
// repeated alt attribute: the first one is used
RUN_TESTp(helper_markup_strip_img, "v <img alt=\"valid\" alt=\"invalid\"> img", "v valid img", NULL);
// src without alt: placeholder text is used as label for the URL
RUN_TESTp(helper_markup_strip_img, "v <img src=\"url.com\"> img", "v [image] img", "[image] url.com");
// alt and src in either order
RUN_TESTp(helper_markup_strip_img, "v <img alt=\"valid\" src=\"url.com\"> img", "v valid img", "[valid] url.com");
RUN_TESTp(helper_markup_strip_img, "v <img src=\"url.com\" alt=\"valid\"> img", "v valid img", "[valid] url.com");
RUN_TESTp(helper_markup_strip_img, "v <img src=\"url.com\" alt=\"valid\" alt=\"i\"> img", "v valid img", "[valid] url.com");
// unterminated attribute values: only the intact attribute is used
RUN_TESTp(helper_markup_strip_img, "i <img alt=\"invalid src=\"https://url.com\"> img", "i [image] img", "[image] https://url.com");
RUN_TESTp(helper_markup_strip_img, "i <img alt=\"broken\" src=\"https://url.com > img", "i broken img", NULL);
RUN_TESTp(helper_markup_strip_img, "i <img alt=\"invalid src=\"https://url.com > img", "i [image] img", NULL);
RUN_TESTp(helper_markup_strip_img, "i <img src=\"url.com alt=\"broken\"> img", "i broken img", NULL);
RUN_TESTp(helper_markup_strip_img, "i <img src=\"url.com\" alt=\"invalid > img", "i [image] img", "[image] url.com");
RUN_TESTp(helper_markup_strip_img, "i <img src=\"url.com alt=\"invalid > img", "i [image] img", NULL);
// tag without closing '>': everything from the tag on is dropped
RUN_TESTp(helper_markup_strip_img, "i <img src=\"url.com\" alt=\"invalid\" img", "i ", NULL);
PASS();
}
// Test suite running all markup related tests.
SUITE(suite_markup)
{
RUN_TEST(test_markup_strip);
RUN_TEST(test_markup_strip_a);
RUN_TEST(test_markup_strip_img);
RUN_TEST(test_markup_transform);
}