serializing dynamically created html with indentation - javascript

Having created a bunch of elements in an HTML document with appendChild(), I am trying to save the modified page on the client. Sending it off to the server seems a bit unnecessary, so I've opted for:
// Create a "save" link whose click handler points the link at a freshly
// built HTML blob, so the browser downloads it under the `download` name.
var save = document.createElement("a");
save.classList.add("button");
Object.assign(save, {
  textContent: "save",
  download: "layout-save.html",
  onclick: function(event) {
    // Chunks of the serialized page; nothing fills this yet.
    var output = [];
    // serialize document to output
    var blobUrl = window.URL.createObjectURL(
      new window.Blob(output, {type: "text/html"})
    );
    save.href = blobUrl;
  }
});
document.body.appendChild(save);
However, the newly created elements aren't indented of course. I've been looking at js-beautify but I also noticed that the mozilla page on parsing and serializing claims that you can use treewalker.
Would anyone know how I might go about doing such a thing? Or failing that, would there be a way to serialize a node without its children in order to run a recursive loop like this:
// Sketch of the intended approach. The two push() calls that carry only a
// comment are placeholders and currently append nothing to `output`.
var output = [];
var serializer = new XMLSerializer();
// Recursively appends `node` and its element children to `output`, one
// line per opening tag, prefixed by indentation.
function indent(node) {
// Push one space for every step between `node` and the root element.
var ancestor = node;
while (ancestor != document.documentElement) {
output.push(" ");
ancestor = ancestor.parentNode;
}
output.push(/* serialize node tagname + attributes */);
output.push("\n");
// node.children lists element children only, so text nodes are not visited.
for (let child of node.children) {
indent(child);
}
output.push(/* node closing tag*/);
}
indent(document.documentElement);
Don't hesitate to tell me if I'm barking up the wrong tree, and thank you for your time.

By way of a reply to my own question, you can serialize a shallow clone to get the opening and closing tags of a node :
// Builds a "save" link. Clicking it serializes the live document as
// tab-indented HTML and offers it as a download named "layout.html".
var save = document.createElement("a");
save.classList.add("button");
save.textContent = "save";
save.download = "layout.html";
save.onclick = function(event) {
  // Elements whose content must not be re-indented (whitespace-sensitive or
  // raw text), plus <template>, whose content is not reachable via childNodes.
  var VERBATIM = /^(pre|textarea|script|style|template)$/;
  var serializer = new XMLSerializer();
  var output = [];

  /**
   * Appends `node` and its whole subtree to `output`, one line per tag or
   * text run, each prefixed by `depth` tabs.
   * @param {Node} node - node to serialize
   * @param {number} depth - nesting level below the root element
   */
  function indent(node, depth) {
    var buffer = "\t".repeat(depth);

    // Text, comments, etc.: keep them (escaped by the serializer) but drop
    // runs that are only whitespace, since indentation is regenerated.
    if (node.nodeType !== Node.ELEMENT_NODE) {
      var text = serializer.serializeToString(node).trim();
      if (text) {
        output.push(buffer + text + "\n");
      }
      return;
    }

    if (VERBATIM.test(node.localName)) {
      output.push(buffer + node.outerHTML + "\n");
      return;
    }

    // outerHTML of a shallow clone is "<tag attrs></tag>" (or just "<tag attrs>"
    // for void elements) in HTML syntax, so no xmlns attribute has to be removed.
    var shell = node.cloneNode(false).outerHTML;
    var closeAt = shell.lastIndexOf("</");
    if (!node.childNodes.length || closeAt === -1) {
      output.push(buffer + shell + "\n");
      return;
    }

    output.push(buffer + shell.slice(0, closeAt) + "\n");
    for (let child of node.childNodes) {
      indent(child, depth + 1);
    }
    output.push(buffer + shell.slice(closeAt) + "\n");
  }

  // Take the button out of the page so it is not part of the saved file,
  // remembering where it was so it can be put back afterwards.
  var parent = save.parentNode;
  var next = save.nextSibling;
  if (parent) {
    parent.removeChild(save);
  }
  try {
    if (document.doctype) {
      output.push(serializer.serializeToString(document.doctype) + "\n");
    }
    indent(document.documentElement, 0);
  } finally {
    if (parent) {
      parent.insertBefore(save, next);
    }
  }

  // Release the blob created by a previous click before replacing it.
  if (save.href.indexOf("blob:") === 0) {
    window.URL.revokeObjectURL(save.href);
  }
  var file = new window.Blob(output,{type:"text/html"});
  save.href = window.URL.createObjectURL(file);
};
document.body.appendChild(save);
manually removing the xhtml namespace is a bit of a shame but since it's XMLSerializer I couldn't see any way around that.

Related

How to convert XML into Javascript object using Javascript code? [duplicate]

I am looking for a JavaScript library that parses an XML string and converts it to a JavaScript object. What are some good ones?
The following function parses XML and returns a JavaScript object with a scheme that corresponds to the XML. XML siblings w/ the same name are collapsed into arrays. nodes with names that can be found in the arrayTags parameter (array of tag name strings) always yield arrays even in case of only one tag occurrence. arrayTags can be omitted. Text nodes with only spaces are discarded.
/**
 * Parses an XML string into a plain object that mirrors the XML structure.
 *
 * Sibling elements sharing a name are collapsed into an array; names listed
 * in `arrayTags` always produce an array, even for a single occurrence.
 * Attributes become properties of the element's object, and text nodes that
 * are not whitespace-only are stored under the "#text" key.
 *
 * @param {string} xml - XML source text
 * @param {string[]} [arrayTags] - element names that must always yield arrays
 * @returns {Object} object keyed by the document's top-level node names
 * @throws {Error} when neither DOMParser nor ActiveXObject is available
 * @throws {string} parse error description on the ActiveXObject path
 */
function parseXml(xml, arrayTags) {
  let dom = null;
  if (window.DOMParser) {
    dom = new DOMParser().parseFromString(xml, "text/xml");
  } else if (window.ActiveXObject) {
    dom = new ActiveXObject('Microsoft.XMLDOM');
    dom.async = false;
    const loaded = dom.loadXML(xml);
    if (!loaded) throw dom.parseError.reason + " " + dom.parseError.srcText;
  } else {
    throw new Error("cannot parse xml string!");
  }

  // True when the caller asked for `name` to always be wrapped in an array.
  const forcedArray = (name) => Boolean(arrayTags) && arrayTags.indexOf(name) !== -1;

  // Converts one DOM node and stores the outcome on `result` (the parent's object).
  function parseNode(xmlNode, result) {
    const name = xmlNode.nodeName;
    if (name === "#text") {
      const value = xmlNode.nodeValue;
      if (value.trim()) result['#text'] = value;
      return;
    }

    const jsonNode = {};
    const existing = result[name];
    if (!existing) {
      result[name] = forcedArray(name) ? [jsonNode] : jsonNode;
    } else if (Array.isArray(existing)) {
      existing.push(jsonNode);
    } else {
      // Second sibling with this name: promote the slot to an array.
      result[name] = [existing, jsonNode];
    }

    if (xmlNode.attributes) {
      for (const attribute of xmlNode.attributes) {
        jsonNode[attribute.nodeName] = attribute.nodeValue;
      }
    }
    for (const child of xmlNode.childNodes) {
      parseNode(child, jsonNode);
    }
  }

  const result = {};
  for (const top of dom.childNodes) {
    parseNode(top, result);
  }
  return result;
}
Here's a nice xml2json and json2xml converter:
http://goessner.net/download/prj/jsonxml/
Related tutorial: http://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html
Here's another one:
http://www.kawa.net/works/js/xml/objtree-e.html
Depending on your needs, you might be able to use a standard parser (see http://www.w3schools.com/XML/tryit.asp?filename=tryxml_parsertest2) and xpath (http://www.w3schools.com/xpath/default.asp) - here's an example:
http://snippets.dzone.com/posts/show/5272
and a few nice tutorials:
http://www.nczonline.net/blog/2009/03/17/xpath-in-javascript-part-1/
https://developer.mozilla.org/en/introduction_to_using_xpath_in_javascript
Going straight to the point (using node-xml2json):
npm install xml2json
Then, use it:
// `xml` is expected to hold the XML source string; it is not defined in this snippet.
const parser = require('xml2json');
// Passing { object: true } presumably makes toJson return a JavaScript object
// rather than a JSON string — confirm against the xml2json README.
const obj = parser.toJson(xml, { object: true });
Example:
// Worked example: parse a small document and read one value back out.
const parser = require('xml2json');
const xml = '<root><person><name>Bob Dylan</name></person></root>';
const obj = parser.toJson(xml, { object: true });
// The result is navigated with the same nesting as the XML: root -> person -> name.
const { person } = obj.root;
person.name; // Bob Dylan
You can also convert from JSON to XML, and much more.
I wanted a simple Typescript version that didn't create additional #text objects and also disregarded attributes. If that's what you need, here's the code:
/**
 * Converts an XML DOM tree into plain JavaScript values.
 * Attributes are ignored; only element names and text content are kept.
 */
export class DomFuncs {
  /**
   * Converts `node` into a plain value.
   *
   * - A node without children yields its `nodeValue`.
   * - A node whose only child is a text node yields that text.
   * - Otherwise an object keyed by child node name is returned; repeated
   *   names are collected into arrays. Whitespace-only text nodes (the
   *   indentation of pretty-printed XML) are skipped so they do not show up
   *   as "#text" entries.
   *
   * @param node DOM node to convert
   * @returns a string, null, or a nested object/array structure
   */
  static parseNode = (node: Node): any => {
    const childNodes = node.childNodes;
    if (childNodes.length === 0) {
      return node.nodeValue;
    }
    if (childNodes.length === 1 && childNodes[0].nodeType === Node.TEXT_NODE) {
      return childNodes[0].nodeValue;
    }

    const obj: { [name: string]: any } = {};
    childNodes.forEach(childNode => {
      const isBlankText =
        childNode.nodeType === Node.TEXT_NODE && !(childNode.nodeValue || '').trim();
      if (isBlankText) {
        return;
      }
      const childName = childNode.nodeName;
      const childValue = obj[childName];
      const parsed = DomFuncs.parseNode(childNode);
      if (childValue === undefined) {
        obj[childName] = parsed;
      } else if (Array.isArray(childValue)) {
        childValue.push(parsed);
      } else {
        obj[childName] = [childValue, parsed];
      }
    });
    return obj;
  };

  /**
   * Parses an XML string and converts the resulting document.
   *
   * @param str XML source text
   * @returns an object with a single key, the document node's name, whose
   *          value is the converted tree
   */
  static xml2obj = (str: string) => {
    const dom = (new DOMParser()).parseFromString(str, 'text/xml');
    const result = {[dom.nodeName]: DomFuncs.parseNode(dom)};
    return result;
  };
}
To use it:
DomFuncs.xml2obj(xmlString);
This script currently disregards XML attributes since my converted object didn't require them. If you need that, let me know and I could update the code.
The xml2json javascript file from https://bitbucket.org/surenrao/xml2json is all you need to do this.
Here's the download link for quick download: https://bitbucket.org/surenrao/xml2json/get/0e0989dfe48e.zip
Once included in your project, here's some sample code to get you started:
// X2J is presumably the global exposed by the included xml2json script — confirm against the library.
var xmlStr = "<root><person><name>Bob Dylan</name></person></root>";
var jsObj = X2J.parseXml(xmlStr);
// Every level of the result is indexed as an array; the element text is read from `jValue`.
var result = jsObj[0].root[0].person[0].name[0].jValue; //Bob Dylan

If I know where the information I want is on a site, how can I scrape it in javascript?

I have a problem: I know what information I want to scrape off a website and I also know where that information is. I know which class it is in and also its XPath.
The problem I'm having is that no matter what I try, it seems like I can't scrape the content.
This is my scrape function:
/**
 * Scrapes one article page: collects keywords, snapshot/full-text/PDF
 * attachments and notes, then imports the page's BibTeX record through the
 * BibTeX import translator and decorates the resulting item.
 *
 * @param {Document} doc - the article page
 * @param {string} url - page URL (unused; the item type is detected from doc.location.href)
 * @throws {string} "No BibTeX found!" when the page has neither inline BibTeX nor a .bib link
 */
function scrape(doc, url) {
  var itemType = detectWeb(doc, doc.location.href);

  var keywords = [];
  var keywordText = doc.evaluate('//div[span="Index Terms:"]/div', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
  if (keywordText) {
    keywords = Zotero.Utilities.trimInternal(keywordText.textContent.toLowerCase()).split(",");
  }

  var notes = [];
  var attachments = [{
    document: doc,
    mimeType: "text/html",
    title: "IEEE Computer Snapshot"
  }];

  // XPath attribute tests use "@" (e.g. @src); "#src" is not valid XPath.
  var htmls = doc.evaluate('//img[@src="/plugins/images/digitalLibrary/dl_html_icon.gif"]/ancestor::a', doc, null, XPathResult.ANY_TYPE, null);
  var htmlDoc = htmls.iterateNext();

  // Never assigned so far, so the abstractNote step below is inert until
  // the abstract is actually extracted from the page.
  var abstracts;

  if (htmlDoc) {
    // The link target is the first double-quoted string inside the href.
    var urlField = htmlDoc.href;
    urlField = urlField.substr(urlField.indexOf('"') + 1);
    urlField = urlField.substr(0, urlField.indexOf('"'));
    // NOTE(review): `templte` is not defined in this function — presumably a
    // query-string fragment declared elsewhere in the translator; confirm the name.
    if (urlField.indexOf("?") > -1) {
      urlField += '&' + templte;
    } else {
      urlField += '?' + templte;
    }
    urlField = "http://www2.computer.org" + urlField;
    attachments.push({
      url: urlField,
      mimeType: "text/html",
      title: "IEEE Computer Full Text Snapshot"
    });
  }

  var pdfurl = ZU.xpathText(doc, '//div[@class="abs-pdf"]/a/@href');
  if (pdfurl) {
    attachments.push({
      url: pdfurl,
      mimeType: "application/pdf",
      title: "IEEE Computer Full Text PDF"
    });
  } else {
    notes.push({
      note: "Complete PDF document was either not available or accessible. Please make sure you're logged in to the digital library to retrieve the complete PDF document."
    });
  }

  // Runs the BibTeX import translator on `bibtexString` and completes each
  // produced item with the data gathered above.
  function importBibtex(bibtexString) {
    var translator = Zotero.loadTranslator("import");
    translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
    translator.setString(bibtexString);
    translator.setHandler("itemDone", function(obj, item) {
      // Add the scheme only when the URL does not already carry one.
      if (item.url && !/^https?:\/\//i.test(item.url)) {
        item.url = "http://" + item.url;
      }
      if (itemType) item.itemType = itemType;
      item.attachments = attachments;
      if (keywords) item.tags = keywords;
      if (notes) item.notes = notes;
      // Strip anything preceding the "10." DOI prefix.
      if (item.DOI) item.DOI = item.DOI.replace(/^.*?10\./, "10.");
      // TODO: author affiliations (meta citation_author_institution) -> item.extra
      if (abstracts) {
        item.abstractNote = abstracts;
      }
      item.complete();
    });
    translator.translate();
  }

  var bibtex = doc.evaluate('//div[@id="bibText-content"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
  var bibtexlink = ZU.xpathText(doc, '//li/a[contains(text(), "BibTex") and contains(@href, ".bib")]/@href');
  if (bibtex) {
    // Inline BibTeX: strip markup and collapse whitespace before importing.
    var bibtexText = Zotero.Utilities.cleanTags(bibtex.textContent);
    importBibtex(Zotero.Utilities.trimInternal(bibtexText));
  } else if (bibtexlink) {
    ZU.doGet(bibtexlink, function(text) {
      importBibtex(text);
    });
  } else {
    throw "No BibTeX found!";
  }
}
It's the variable called abstracts that I wanna fill with the abstract from this website.
ieee article
I used Firebug to locate where this information is stored and found it inside the element with class="article", within the div with id="tabs-main".
It looks something like this:
<div id="tabs-main">
<!-- place holder -->
<div class="tab-content" id="articleDetails" role="main" data-section="articleDetails.ajax"
>
<div class="article-blk">
<div class="article">
(I want this)--> <p>Distributed database systems (DDBS) have received considerable attention in recent years. Being a relatively young research field, there are still many problems associated with DDB systems that need solution. Concurrency control is one of these problems and, probably, the most extensively studied. However, most of the work has concentrated on the development of alternative solutions and the field seems to be ready for some comparative analysis work. This paper reports the results of a performance evaluation study on distributed database concurrency control algorithms. The research has resulted in the development of a formalism, based on Petri nets, for modeling and analysis purposes. The formalism, called the Extended Place/Transition Nets (EPTN), is both descriptively powerful in that it can be used to model various algorithms precisely and succinctly and to communicate them in a clear manner, while at the same time lending itself to be used as a performance evaluation tool. An EPTN simulator is implemented and various algorithms are studied using this tool. This paper describes both the formalism and the performance results that have been obtained.</p>
</div>
And in Firebug I also get the XPath which is:
/html/body/div[2]/div[8]/div/div[2]/div/div[2]/div[1]/div/div[1]
But I don't know how I can get this content. I have tried with
var abstracts = doc.querySelector(".article").innerHTML;
I have tried with doc.getElementByClassName().
But I can never get the content, var is always null.
Someone out there have an idea?

"It is necessary to detach the element" error in google docs script

When I try to copy paragraphs from one doc to another I get an unexpected error:
It is necessary to detach the element
What does it mean? What am I doing wrong?
// Copies every paragraph of one document into another, keeping the order.
// This is the version that reproduces the "necessary to detach the element" error.
function test_copy_paragrahps() {
var final = 'final';
var doc1 = get_doc('', final);
var doc2 = create_doc_in_path('', final+'test');
var body1 = doc1.getBody();
var body2 = doc2.getBody();
var par1 = body1.getParagraphs();
for (var i=0;i<par1.length;i++) {
// par1[i] comes straight from body1.getParagraphs(), so it is still attached
// to doc1's body when it is handed to insertParagraph — the failing call.
body2.insertParagraph(i, par1[i]);
}
}
here is video http://youtu.be/1WdCD5ATiYw
P.S. You can ignore the implementations of get_doc and create_doc_in_path; both return a Document object.
You attempted to insert a paragraph that already has a parent Body. You need to create a detached copy of the paragraph before you can insert it.
See this part of the documentation that mentions detaching a paragraph.
I believe this will fix the error:
/**
 * Copies every paragraph of the source document into the target document,
 * preserving order. Each paragraph is inserted as a detached copy().
 */
function test_copy_paragrahps() {
  var baseName = 'final';
  var source = get_doc('', baseName);
  var target = create_doc_in_path('', baseName + 'test');
  var sourceBody = source.getBody();
  var targetBody = target.getBody();
  sourceBody.getParagraphs().forEach(function (paragraph, index) {
    // copy() yields a paragraph with no parent, which is what insertion needs.
    targetBody.insertParagraph(index, paragraph.copy());
  });
}

Parse values from HTML element using Google App Script?

I am trying to parse HTML element by class on Google Sites, my code is:
/**
 * Web-app entry point: fetches the page, parses it with XmlService, picks the
 * first element whose class is 'span3 utm' and returns its raw markup as
 * HTML output.
 */
function doGet() {
  var page = UrlFetchApp.fetch('http://indicadoresdeldia.cl/').getContentText();
  var root = XmlService.parse(page).getRootElement();
  // getElementsByClassName is not built in; it must be defined in the project.
  var menu = getElementsByClassName(root, 'span3 utm')[0];
  var markup = XmlService.getRawFormat().format(menu);
  return HtmlService.createHtmlOutput(markup);
}
When I run the code, the following error message appears: ReferenceError: "getElementsByClassName" is not defined.
I am trying to follow the example from this page: https://sites.google.com/site/scriptsexamples/learn-by-example/parsing-html
Any ideas?
THanks in advance for your help.
According to that site, you should directly copy those functions to your project (source code available there) and then call them. That would alleviate each and every one of your problems.
Source: https://sites.google.com/site/scriptsexamples/learn-by-example/parsing-html
/**
 * Collects `element` and all of its descendants whose class attribute
 * matches `classToFind`.
 *
 * A node matches when its class attribute equals `classToFind` exactly, or
 * when one of its whitespace-separated class names equals `classToFind`.
 *
 * @param {Object} element - root node; must provide getDescendants()
 * @param {string} classToFind - class name (or exact class attribute value)
 * @returns {Array} matching elements, descendants first, `element` itself last
 */
function getElementsByClassName(element, classToFind) {
  var data = [];
  var descendants = element.getDescendants();
  descendants.push(element);
  // Index loop with a declared counter: the previous for...in loops leaked
  // `i` and `j` as globals, which throws in strict mode.
  for (var i = 0; i < descendants.length; i++) {
    // asElement() yields null for nodes that are not elements (e.g. text).
    var elt = descendants[i].asElement();
    if (elt == null) continue;
    var attribute = elt.getAttribute('class');
    if (attribute == null) continue;
    var value = attribute.getValue();
    // Split on any whitespace so tab/newline separated class lists also match.
    if (value === classToFind || value.split(/\s+/).indexOf(classToFind) !== -1) {
      data.push(elt);
    }
  }
  return data;
}

How to parse a text like XML to a javascript object

I created a function to do this.
// Sample input: bracket-delimited tags mixed with free text.
var text="adsf [name]Victor[/name] dummytext [name]Elliot[/name] asdf [name]Jake[/name] asdf [foo]bar[/foo]";
// readTags collects the content of every occurrence of the given tag.
alert( readTags(text,'name') ); //Victor,Elliot,Jake
alert( readTags(text,'foo') ); //bar
but now I would like to implement a function that receives a string like this
[person]
[name]jake[/name]
[age]12[/age]
[/person]
and returns an object like this
// Desired result for the [person] input above (body of the function to be written).
var object={};
// As written, 'name' and 'age' are set on the top-level object, next to an
// empty 'person' entry, rather than nested inside it.
object['person']={};
object['name']='jake';
object['age']='12';
return(object);
but I don't know how to loop through the text. How to deal with starting and ending tags?
like
[tag] [tag]value[/tag] [/tag]
I thought of finding the starting tag from the left and the ending tag from the right using indexOf('[tag]') and lastIndexOf('[/tag]'),
but that doesn't work in this situation
[tag]value[/tag] [tag]value[/tag]
this is the previous function
/**
 * Returns the contents of every `[property]...[/property]` pair in `str`,
 * in order of appearance.
 *
 * The end tag is always searched for after its begin tag, so a stray end tag
 * earlier in the text cannot cause a value to be reported twice. Scanning
 * stops at a begin tag that has no end tag; no null entries are produced.
 *
 * @param {string} str - text to scan
 * @param {string} property - tag name without brackets
 * @returns {string[]} the text found between each begin/end tag pair
 */
function readTags(str, property) {
  var beginTag = '[' + property + ']';
  var endTag = '[/' + property + ']';
  var values = [];
  var cursor = 0;
  while (true) {
    var begin = str.indexOf(beginTag, cursor);
    if (begin === -1) break;
    var contentStart = begin + beginTag.length;
    var end = str.indexOf(endTag, contentStart);
    if (end === -1) break; // unterminated tag: nothing more to extract
    values.push(str.substring(contentStart, end));
    cursor = end + endTag.length;
  }
  return values;
}
/**
 * Returns the text between the first `strBegin` and the first `strEnd` that
 * follows it, e.g. strBetween("abcdef", "b", "e") gives "cd".
 *
 * @param {string} string - text to search
 * @param {string} strBegin - opening delimiter
 * @param {string} strEnd - closing delimiter
 * @returns {string|null} the enclosed text, or null when either delimiter is missing
 */
function strBetween(string, strBegin, strEnd) {
  var from = string.indexOf(strBegin);
  if (from === -1) {
    return null;
  }
  var start = from + strBegin.length;
  var to = string.indexOf(strEnd, start);
  return to === -1 ? null : string.slice(start, to);
}
Unless you have a good reason not to use JSON, don't do this. JSON handles all of those problems very well and can be floated around from server to client and vice versa quite easily.
But since this seems fun, I'll try and see if I can whip up an answer.
Since your structure resembles XML, just replace the brackets with < and > and parse it like XML:
// Turn the bracket syntax into angle-bracket markup. Regexes with the g flag
// are required: replace() with a string pattern converts only the first match.
text = text.replace(/\[/g, '<').replace(/\]/g, '>');
var xml;
if (typeof DOMParser != "undefined") {
  var parser = new DOMParser();
  xml = parser.parseFromString(text, 'text/xml');
} else {
  // Legacy Internet Explorer fallback.
  xml = new ActiveXObject('Microsoft.XMLDOM');
  // Must be the boolean false; the string 'false' is truthy.
  xml.async = false;
  xml.loadXML(text);
}
Now xml holds a DOMDocument that you can parse:
// getElementsByTagName returns a collection, so pick the first match; the property is spelled childNodes.
xml.getElementsByTagName('person')[0].childNodes;
Try this possibly-working code (didn't test):
/**
 * Converts a DOM node into a plain value.
 *
 * - A node without children yields its nodeValue (the text of a text node,
 *   null for an empty element).
 * - An element containing only text yields that text.
 * - An element containing child elements yields an object keyed by each
 *   child's tag name; repeated tag names are collected into an array.
 *   Text mixed in between child elements is ignored.
 *
 * @param {Node} element - node to convert
 * @returns {Object|string|null} the converted value
 */
function createObject(element) {
  var children = element.childNodes;
  if (!children || children.length === 0) {
    return element.nodeValue;
  }

  var object = {};
  var text = '';
  var hasElementChild = false;
  for (var i = 0; i < children.length; i++) {
    var child = children[i];
    if (child.nodeType === 1) { // element node
      hasElementChild = true;
      var key = child.tagName;
      var value = createObject(child);
      if (!Object.prototype.hasOwnProperty.call(object, key)) {
        object[key] = value;
      } else if (Array.isArray(object[key])) {
        object[key].push(value);
      } else {
        object[key] = [object[key], value];
      }
    } else if (child.nodeType === 3) { // text node
      text += child.nodeValue;
    }
  }
  return hasElementChild ? object : text;
}
I thought this would be interesting to do without a third-party parser, so I built me a simple one:
/**
 * Parses bracket markup such as "[person][name]jake[/name][/person]" into a
 * nested object.
 *
 * Every start tag creates an empty object under its name on the current
 * object; text is stored under the "#text" key of the object being built.
 * Whitespace-only text (line breaks and indentation between tags) is
 * ignored, and a closing tag with nothing left to close is skipped instead
 * of breaking the parse. Closing tag names are not checked against the
 * opening tag, and a repeated tag name overwrites the earlier entry.
 *
 * @param {string} code - markup to parse
 * @returns {Object} the nested object for the whole input
 */
function parse(code)
{
  var obj = {},
      cur = obj,
      stack = [];
  // Either a [tag] / [/tag], or a non-empty run of text up to the next "[".
  var token = /\[([^\]]+)\]|([^\[]+)/g;
  var match;

  while ((match = token.exec(code)) !== null)
  {
    var tagName = match[1];
    if (tagName)
    {
      if (tagName.charAt(0) == "/")
      {
        /* end tag: go back to the parent, if there is one */
        if (stack.length)
        {
          cur = stack.pop();
        }
      }
      else
      {
        /* start tag */
        stack.push(cur);
        cur = cur[tagName] = {};
      }
    }
    else if (match[2].trim())
    {
      cur["#text"] = match[2];
    }
  }

  return obj;
}
var obj = parse(text);
JSON <=> XML http://code.google.com/p/x2js/

Categories