/* copyright 2013 Sascha Kruse and contributors (see LICENSE for licensing information) */

#include "markup.h"

/* NOTE(review): the names of the five system headers were lost; this list is
 * reconstructed from what the file uses (assert, isdigit/isxdigit, bool,
 * strstr/strchr/strlen) - confirm against upstream. */
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#include "log.h"
#include "settings.h"
#include "utils.h"

/**
 * Convert all HTML special symbols to HTML entities.
 *
 * The ampersand is replaced first, so the ampersands introduced by the
 * following replacements are not escaped a second time.
 *
 * @param str (nullable) The string to quote. It is handed to
 *            string_replace_all(), whose return value replaces it.
 * @return The quoted string or NULL if \p str is NULL
 */
static char *markup_quote(char *str)
{
        ASSERT_OR_RET(str, NULL);

        str = string_replace_all("&", "&amp;", str);
        str = string_replace_all("\"", "&quot;", str);
        str = string_replace_all("'", "&apos;", str);
        str = string_replace_all("<", "&lt;", str);
        str = string_replace_all(">", "&gt;", str);

        return str;
}

/**
 * Convert all HTML special entities to their actual char.
 *
 * The ampersand entity is replaced last, so that an escaped entity like
 * "&amp;lt;" ends up as the text "&lt;" and not as "<".
 *
 * @param str (nullable) The string to unquote. It is handed to
 *            string_replace_all(), whose return value replaces it.
 * @return The unquoted string or NULL if \p str is NULL
 */
static char *markup_unquote(char *str)
{
        ASSERT_OR_RET(str, NULL);

        str = string_replace_all("&quot;", "\"", str);
        str = string_replace_all("&apos;", "'", str);
        str = string_replace_all("&lt;", "<", str);
        str = string_replace_all("&gt;", ">", str);
        str = string_replace_all("&amp;", "&", str);

        return str;
}

/**
 * Convert all HTML linebreak tags to a newline character
 *
 * Handles the spellings "<br>", "<br/>" and "<br />".
 *
 * @param str (nullable)
 * @return The converted string or NULL if \p str is NULL
 */
static char *markup_br2nl(char *str)
{
        ASSERT_OR_RET(str, NULL);

        str = string_replace_all("<br>", "\n", str);
        str = string_replace_all("<br/>", "\n", str);
        str = string_replace_all("<br />", "\n", str);

        return str;
}

/* see markup.h */
void markup_strip_a(char **str, char **urls)
{
        assert(*str);
        char *tag1 = NULL;

        if (urls)
                *urls = NULL;

        /* NOTE(review): the needle literal was lost; "<a" is reconstructed
         * and also matches anchors without attributes ("<a>") - confirm. */
        while ((tag1 = strstr(*str, "<a"))) {
                // use href=" as stated in the notification spec
                char *href = strstr(tag1, "href=\"");
                char *tag1_end = strstr(tag1, ">");
                char *tag2 = strstr(tag1, "</a>");

                // the tag is broken, ignore it
                if (!tag1_end) {
                        LOG_W("Given link is broken: '%s'",
                              tag1);
                        // store the result, like the regular path below does,
                        // so *str never refers to a buffer that was replaced
                        *str = string_replace_at(*str, tag1-*str, strlen(tag1), "");
                        break;
                }
                if (tag2 && tag2 < tag1_end) {
                        int repl_len = (tag2 - tag1) + strlen("</a>");
                        LOG_W("Given link is broken: '%.*s.'",
                              repl_len, tag1);
                        *str = string_replace_at(*str, tag1-*str, repl_len, "");
                        break;
                }

                // search contents of href attribute
                char *plain_url = NULL;
                if (href && href < tag1_end) {

                        // shift href to the actual begin of the value
                        // (6 == strlen("href=\""))
                        href = href+6;

                        const char *quote = strstr(href, "\"");

                        if (quote && quote < tag1_end) {
                                plain_url = g_strndup(href, quote-href);
                        }
                }

                // text between a tags
                // (everything up to the end of the string, if the closing tag is missing)
                int text_len;
                if (tag2)
                        text_len = tag2 - (tag1_end+1);
                else
                        text_len = strlen(tag1_end+1);

                char *text = g_strndup(tag1_end+1, text_len);

                // replace opening tag, text and (if present) closing tag by the text
                int repl_len = text_len + (tag1_end-tag1) + 1;
                repl_len += tag2 ? strlen("</a>") : 0;

                *str = string_replace_at(*str, tag1-*str, repl_len, text);

                // if there had been a href attribute,
                // add it to the URLs
                if (plain_url && urls) {
                        // square brackets delimit the link text in the URL entry
                        text = string_replace_all("]", "", text);
                        text = string_replace_all("[", "", text);

                        char *url = g_strdup_printf("[%s] %s", text, plain_url);

                        *urls = string_append(*urls, url, "\n");
                        g_free(url);
                }

                g_free(plain_url);
                g_free(text);
        }
}

/* see markup.h */
void markup_strip_img(char **str, char **urls)
{
        const char *start;

        if (urls)
                *urls = NULL;

        /* NOTE(review): the needle literal was lost; "<img" is
         * reconstructed - confirm against upstream. */
        while ((start = strstr(*str, "<img"))) {
                const char *end = strstr(start, ">");

                // the tag is broken, ignore it
                if (!end) {
                        LOG_W("Given image is broken: '%s'", start);
                        // store the result, like the regular path below does,
                        // so *str never refers to a buffer that was replaced
                        *str = string_replace_at(*str, start-*str, strlen(start), "");
                        break;
                }

                // use attribute=" as stated in the notification spec
                const char *alt_s = strstr(start, "alt=\"");
                const char *src_s = strstr(start, "src=\"");

                char *text_alt = NULL;
                char *text_src = NULL;

                // closing quotes of the attribute values
                const char *src_e = NULL, *alt_e = NULL;
                if (alt_s)
                        alt_e = strstr(alt_s + strlen("alt=\""), "\"");
                if (src_s)
                        src_e = strstr(src_s + strlen("src=\""), "\"");

                // Move pointer to the actual start
                alt_s = alt_s ? alt_s + strlen("alt=\"") : NULL;
                src_s = src_s ? src_s + strlen("src=\"") : NULL;

                /* check if alt and src attribute are given
                 * If both given, check the alignment of all pointers */
                if (   alt_s && alt_e
                    && src_s && src_e
                    && (  (alt_s < src_s && alt_e < src_s-strlen("src=\"") && src_e < end)
                        ||(src_s < alt_s && src_e < alt_s-strlen("alt=\"") && alt_e < end)) ) {

                        text_alt = g_strndup(alt_s, alt_e-alt_s);
                        text_src = g_strndup(src_s, src_e-src_s);

                /* check if single valid alt attribute is available */
                } else if (alt_s && alt_e && alt_e < end && (!src_s || src_s < alt_s || alt_e < src_s - strlen("src=\""))) {
                        text_alt = g_strndup(alt_s, alt_e-alt_s);

                /* check if single valid src attribute is available */
                } else if (src_s && src_e && src_e < end && (!alt_s || alt_s < src_s || src_e < alt_s - strlen("alt=\""))) {
                        text_src = g_strndup(src_s, src_e-src_s);

                } else {
                        LOG_W("Given image argument is broken: '%.*s'",
                              (int)(end-start), start);
                }

                // replacement text for alt
                int repl_len = end - start + 1;

                if (!text_alt)
                        text_alt = g_strdup("[image]");

                *str = string_replace_at(*str, start-*str, repl_len, text_alt);

                // if there had been a src attribute,
                // add it to the URLs
                if (text_src && urls) {
                        // square brackets delimit the alt text in the URL entry
                        text_alt = string_replace_all("]", "", text_alt);
                        text_alt = string_replace_all("[", "", text_alt);

                        char *url = g_strdup_printf("[%s] %s", text_alt, text_src);

                        *urls = string_append(*urls, url, "\n");
                        g_free(url);
                }

                g_free(text_src);
                g_free(text_alt);
        }
}

/* see markup.h */
char *markup_strip(char *str)
{
        ASSERT_OR_RET(str, NULL);

        /* strip all tags */
        string_strip_delimited(str, '<', '>');

        /* unquote the remainder */
        str = markup_unquote(str);

        return str;
}

/**
 * Determine if an & character pointed to by \p str is a markup & entity or
 * part of the text
 *
 * Accepted are the numeric forms &#1234; and &#xABC; as well as the named
 * entities &amp;, &lt;, &gt;, &quot; and &apos;.
 *
 * @param str Pointer to the '&' character to examine (asserted)
 *
 * @retval true: \p str is an entity
 * @retval false: It's no valid entity
 */
static bool markup_is_entity(const char *str)
{
        assert(str);
        assert(*str == '&');

        // without a terminating semicolon there cannot be any entity
        const char *end = strchr(str, ';');
        ASSERT_OR_RET(end, false);

        // Parse (hexa)decimal entities with the format &#1234; or &#xABC;
        if (str[1] == '#') {
                const char *cur = str + 2;

                // the <ctype.h> functions require a value representable as
                // unsigned char, therefore cast the plain char before the call
                if (*cur == 'x') {
                        cur++;

                        // Reject &#x;
                        if (*cur == ';')
                                return false;

                        while (isxdigit((unsigned char)*cur) && cur < end)
                                cur++;
                } else {

                        // Reject &#;
                        if (*cur == ';')
                                return false;

                        while (isdigit((unsigned char)*cur) && cur < end)
                                cur++;
                }

                // valid only if nothing but digits precedes the semicolon
                return (cur == end);
        } else {
                const char *supported_tags[] = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
                for (size_t i = 0; i < sizeof(supported_tags)/sizeof(*supported_tags); i++) {
                        if (g_str_has_prefix(str, supported_tags[i]))
                                return true;
                }
                return false;
        }
}

/**
 * Escape all unsupported and invalid &-entities in a string. If the resulting
 * string does not fit it will be reallocated.
 *
 * @param str (nullable) The string to be transformed
 * @return The transformed string or NULL if \p str is NULL
 */
static char *markup_escape_unsupported(char *str)
{
        ASSERT_OR_RET(str, NULL);

        char *match = str;
        while ((match = strchr(match, '&'))) {
                if (!markup_is_entity(match)) {
                        // remember the offset, as str may change on replacement
                        int pos = match - str;
                        str = string_replace_at(str, pos, 1, "&amp;");
                        // continue the search behind the inserted entity
                        match = str + pos + strlen("&amp;");
                } else {
                        match++;
                }
        }

        return str;
}

/* see markup.h */
char *markup_transform(char *str, enum markup_mode markup_mode)
{
        ASSERT_OR_RET(str, NULL);

        switch (markup_mode) {
        case MARKUP_NULL:
                /* `assert(false)`, but with a meaningful error message */
                assert(markup_mode != MARKUP_NULL);
                break;
        case MARKUP_NO:
                str = markup_quote(str);
                break;
        case MARKUP_STRIP:
                str = markup_br2nl(str);
                str = markup_strip(str);
                str = markup_quote(str);
                break;
        case MARKUP_FULL:
                str = markup_escape_unsupported(str);
                str = markup_br2nl(str);
                markup_strip_a(&str, NULL);
                markup_strip_img(&str, NULL);
                break;
        }

        if (settings.ignore_newline) {
                str = string_replace_all("\n", " ", str);
        }

        return str;
}

/* vim: set tabstop=8 shiftwidth=8 expandtab textwidth=0: */