replacing text from a paste when looping over html elements - javascript

I am trying to replace html links (and eventually other elements) with bbcode when a user does a paste from a document (like gdocs or libre office). So we are dealing with rich html already formatted (which is why it needs to copy HTML and not text).
Essentially, I want to be able to copy stuff pre-written from a document into a textarea on my website without having to manually write BBCode tags in the original document (as it's messy for proof-reading).
Thanks to the help here Adjust regex to ignore anything else inside link HTML tags I have gotten mostly there, but I am stuck on replacing the found tags with the original text.
Here's what I have:
function fragmentFromString(strHTML) {
return document.createRange().createContextualFragment(strHTML);
}
$('textarea').on('paste',function(e) {
e.preventDefault();
var text = (e.originalEvent || e).clipboardData.getData('text/html') || prompt('Paste something..');
var fragment = fragmentFromString(text);
var aTags = Array.from(fragment.querySelectorAll('a'));
aTags.forEach(a => {
text = text.replace(a, "[url="+a.href+"]"+a.textContent+"[/url]");
});
window.document.execCommand('insertText', false, text);
});
You can see it loops over the found a tags and I am essentially trying to replace them from the original text with the new stuff.
Here's an example of the type of content that could be pasted (this is a single link from google docs):
<span style="font-size:14.666666666666666px;font-family:Arial;color:#1155cc;background-color:transparent;font-weight:700;font-style:normal;font-variant:normal;text-decoration:none;vertical-align:baseline;white-space:pre-wrap;">Link test</span>
Expected to be replaced with:
[url=https://www.test.com]Link test[/url]
So I want that HTML replaced, with the BBCode within the original text that's then sent to the textarea from the paste.

The aTags foreach currently does nothing. You need to create a new text node, and replace the existing anchor tag with it.
aTags.forEach(a => {
var new_text = document.createTextNode("[url=" + a.href + "]" + a.textContent + "[/url]");
a.parentNode.insertBefore(new_text, a);
a.parentNode.removeChild(a);
});
window.document.execCommand('insertText', false, text.innerText);
This will replace every a tag into the given text.

Related

Convert HTML to plain text keeping links, bold and italic in Javascript

I am configuring an API to send an email using the content of a publication as the body of the email. The text editor used for the publication save the text in HTML so I needed to convert the result into plain text. There are other questions that have gave me a solution, but I would like to keep from the original text the bold text, italic and the links. So this is what I have:
Body of a test publicacion:
This is bold text.This is regular text.This is italic.This is a link.
Then in the script I have the following function:
function htmlToText(html){
//remove code brakes and tabs
html = html.replace(/\n/g, "");
html = html.replace(/\t/g, "");
//keep html brakes and tabs
html = html.replace(/<\/td>/g, "\t");
html = html.replace(/<\/table>/g, "\n");
html = html.replace(/<\/tr>/g, "\n");
html = html.replace(/<\/p>/g, "\n");
html = html.replace(/<\/div>/g, "\n");
html = html.replace(/<\/h>/g, "\n");
html = html.replace(/<br>/g, "\n"); html = html.replace(/<br( )*\/>/g, "\n");
html = html.replace(/<a.*href="(.*?)".*>(.*?)<\/a>/gi, " $2 (Link->$1) ");
//parse html into text
var dom = (new DOMParser()).parseFromString('<!doctype html><body>' + html, 'text/html');
return dom.body.textContent;
}
That gives me some plain text with nice line breaks, but I was wondering if I could get the bold, italic and links.
Thanks.
I had some time on my hands and played around. This is what I came up with:
const copy=document.createElement("div");
copy.innerHTML=container.innerHTML.replace(/\n/g," ").replace(/[\t\n]+/g,"");
const tags={B:["**","**",1], // [<prefix>, <postfix>, <sequence-number> ]
I:["*","*",2],
H2:["##","\n",3],
P:["\n","\n",4],
DIV:["","\n",5],
TD:["","\t",6]};
[...copy.querySelectorAll(Object.keys(tags).join(","))]
.sort((a,b)=>tags[a.tagName][2]-tags[b.tagName][2])
.forEach(e=>{
const [a,b]=tags[e.tagName];
e.innerHTML=(e.matches("TD:first-child") ? "\n": a) + e.innerHTML + b;
});
console.log(copy.textContent.replace(/^ */mg,""));
<div id="container">
<H2>Second level heading</H2>
<div><div>
A <b>first div</b> with a
link (abc) and a
<p>paragraph having itself another link (def) in it.</p>
</div>
</div>
And here is some more <i>"lost" text</i> ...
<table>
<tr><td>one</td><td><b>two</b></td><td>three</td></tr>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr><td>d</td><td>e</td><td>f</td></tr>
</table>
</div>
Instead of using regexp to "parse" the html I chose to actually treat it in a DOM way: I create a new div element (copy) into which I insert the original .innerHTML. For particular element types I then define some pre- and postfixes that should surround the original .innerHTML. These are stored in tags and applied on the freshly created div element.
This is done by selecting all of the "special" elements (as specified by the tags-keys) and processing them in a given sequential order. Afterwards I simply return the .textContent of the modified copy element.
Plain text cannot really render bold or italics text decoration. For this reason I used modifiers in the markdown style (*:italics, **:bold)

How to remove content within the &lt and &gt javascript

I have a content that contains a string of elements along with images. ex:
var str= <p><img src=\"v\">fwefwefw</img></p><p><br></p><p><br></p>
the text that is within the &lt and &gt is a dirty tag and I would like to remove it along with the content that is within it. the tag is generated dynamically and hence could be any tag i.e <div>, <a>, <h1> etc....
the expected output : <p></p><p><br></p><p><br></p>
however with this code, im only able to remove the tags and not the content inside it.
str.replaceAll(/<.*?>/g, "");
it renders like this which is not what im looking for:
<p>fwefwefw</p><p><br></p><p><br></p><p><br></p>
how can I possibly remove the & tags along with the content so that I get rid of dirty tags and text inside it?
fiddle: https://jsfiddle.net/3rozjn8m/
thanks
A safe way is to use a DOM parser, visiting each text node, where then each text can be cleaned separately. This way you are certain the DOM structure is not altered; only the texts:
let str= "<p><img src=\"v\">fwefwefw</img></p><p><br></p><p><br></p>";
let doc = new DOMParser().parseFromString(str, "text/html");
let walk = doc.createTreeWalker(doc.body, 4, null, false);
let node = walk.nextNode();
while (node) {
node.nodeValue = node.nodeValue.replace(/<.*>/gs, "");
node = walk.nextNode();
}
let clean = doc.body.innerHTML;
console.log(clean);
This will also work when you have more than one <p> element that has such content.
Remove the question mark.
var str= "<p><img src=\"v\">fwefwefw</img></p><p><br></p><p><br></p>";
console.log(str.replaceAll(/<.*>/g, ""));

How can I Strip all regular html tags except <a></a>, <img>(attributes inside) and <br> with javascript?

When a user create a message there is a multibox and this multibox is connected to a design panel which lets users change fonts, color, size etc.. When the message is submited the message will be displayed with html tags if the user have changed color, size etc on the font.
Note: I need the design panel, I know its possible to remove it but this is not the case :)
It's a Sharepoint standard, The only solution I have is to use javascript to strip these tags when it displayed. The user should only be able to insert links, images and add linebreaks.
Which means that all html tags should be stripped except <a></a>, <img> and <br> tags.
Its also important that the attributes inside the the <img> tag that wont be removed. It could be isplayed like this:
<img src="/image/Penguins.jpg" alt="Penguins.jpg" style="margin:5px;width:331px;">
How can I accomplish this with javascript?
I used to use this following codebehind C# code which worked perfectly but it would strip all html tags except <br> tag only.
public string Strip(string text)
{
return Regex.Replace(text, #"<(?!br[\x20/>])[^<>]+>", string.Empty);
}
Any kind of help is appreciated alot
Does this do what you want? http://jsfiddle.net/smerny/r7vhd/
$("body").find("*").not("a,img,br").each(function() {
$(this).replaceWith(this.innerHTML);
});
Basically select everything except a, img, br and replace them with their content.
Smerny's answer is working well except that the HTML structure is like:
var s = '<div><div>Link<span> Span</span><li></li></div></div>';
var $s = $(s);
$s.find("*").not("a,img,br").each(function() {
$(this).replaceWith(this.innerHTML);
});
console.log($s.html());
The live code is here: http://jsfiddle.net/btvuut55/1/
This happens when there are more than two wrapper outside (two divs in the example above).
Because jQuery reaches the most outside div first, and its innerHTML, which contains span has been retained.
This answer $('#container').find('*:not(br,a,img)').contents().unwrap() fails to deal with tags with empty content.
A working solution is simple: loop from the most inner element towards outside:
var $elements = $s.find("*").not("a,img,br");
for (var i = $elements.length - 1; i >= 0; i--) {
var e = $elements[i];
$(e).replaceWith(e.innerHTML);
}
The working copy is: http://jsfiddle.net/btvuut55/3/
with jQuery you can find all the elements you don't want - then use unwrap to strip the tags
$('#container').find('*:not(br,a,img)').contents().unwrap()
FIDDLE
I think it would be better to extract to good tags. It is easy to match a few tags than to remove the rest of the element and all html possibilities. Try something like this, I tested it and it works fine:
// the following regex matches the good tags with attrinutes an inner content
var ptt = new RegExp("<(?:img|a|br){1}.*/?>(?:(?:.|\n)*</(?:img|a|br){1}>)?", "g");
var input = "<this string would contain the html input to clean>";
var result = "";
var match = ptt.exec(input);
while (match) {
result += match;
match = ptt.exec(input);
}
// result will contain the clean HTML with only the good tags
console.log(result);

string search in body.html() not working

Hi here is my total work to search a string in HTML and highlight it if it is found in document:
The problem is here
var SearchItems = text.split(/\r\n|\r|\n/);
var replaced = body.html();
for(var i=0;i<SearchItems.length;i++)
{
var tempRep= '<span class="highlight" style="background-color: yellow">';
tempRep = tempRep + SearchItems[i];
tempRep = tempRep + '</span>';
replaced = replaced.replace(SearchItems[i],tempRep); // It is trying to match along with html tags...
// As the <b> tags will not be there in search text, it is not matching...
}
$("body").html(replaced);
The HTML I'm using is as follows;
<div>
The clipboardData object is reserved for editing actions performed through the Edit menu, shortcut menus, and shortcut keys. It transfers information using the system clipboard, and retains it until data from the next editing operation replace s it. This form of data transfer is particularly suited to multiple pastes of the same data.
<br><br>
This object is available in script as of <b>Microsoft Internet Explorer 5.</b>
</div>
<div class='b'></div>
If I search for a page which is pure or without any html tags it will match. However, if I have any tags in HTML this will not work.. Because I am taking body html() text as the target text. It is exactly trying to match along with html tags..
In fiddle second paragraph will not match.
First of all, to ignore the HTML tags of the element to look within, use the .text() method.
Secondly, in your fiddle, it wasn't working because you weren't calling the SearchQueue function on load.
Try this amended fiddle

workaround for ckeditor bug nesting divs bug?

Any suggestions or workarounds for this bug?
http://dev.ckeditor.com/ticket/6436
i really need to get around this bug as i need to delete whole divs with one backspace in the editor area in CKEditor. but on insertion the divs get nested due to the bug. So, deletion of individual divs become impossible.
Here is the workaround i finally designed--
http://thecamelcase.com/2011/06/reining-in-the-cursor-in-ckeditor/
I encountered this same issue and unfortunately the link to the fix made by ghostCoder isn't available anymore.
In addition to nested divs issue, we have forced paste as plain text (so data is pasted as plain text unless "paste from word" is used) and we are using div as our enterMode. In the pasted data line breaks were replaced with <br /> tags whereas we wanted each line to be wrapper inside element.
This our how I eventually solved the issue. You should know that we have not changed autoParagraph config option so it defaults to true and I don't recommend disabling it in config level. I'll try to code comment this solution well so that you get a good idea what is actually done here.
config.on.instanceReady = function(e) {
var enterMode = e.editor.config.enterMode, elem = 'div';
if(enterMode === CKEDITOR.ENTER_P) {
elem = 'p';
}
// We didn't encounter any issues when using br as enterMode so
// continue only if enterMode is div or p element
if(enterMode === CKEDITOR.ENTER_DIV || enterMode === CKEDITOR.ENTER_P) {
// Disable autoParagraph in initialization to avoid nested div issue.
// When autoParagraph is manipulated this way it will still be active
// later on so if you write some inline content in source mode it
// will be properly wrapped inside <p> or <div> element.
e.editor.config.autoParagraph = false;
// Handle paste event to get rid of <br /> line breaks issue
e.editor.on('paste', function(evt) {
var data = '';
// Stop event, handle everything here manually
evt.stop();
// Get data either from event's text or html property
if(evt.data.text) {
// Remove HTML markup from pasted data
data = evt.data.text.replace(/(<([^>]+)>)/ig, '');
// Replace all line breaks with </div><div> or </p><p> markup
// And wrap the whole content inside <div> or <p> element
data = '<' + elem + '>'
+ data.replace(/\r\n|\r|\n/g, '</' + elem + '><' + elem + '>')
+ '</' + elem + '>';
} else {
// If data was not pasted as plain text just
// get data as is from event's html property
data = evt.data.html;
}
// Insert HTML data to editor
evt.editor.insertHtml(data);
// Fire afterPaste manually as we have stopped the event
// and afterPaste wouldn't get triggered otherwise
evt.editor.fire( 'afterPaste' );
}, e.editor.element.$);
}
};

Categories