Parsing Data from HTML without IDs

Parsing Data from HTML without IDs - javascript

I want to parse a website (with JavaScript or JSoup).
My problem is that I don't know how to access the wanted data, because the file contains practically no IDs.
I have something like:
<div id content>
<table>
<tbody>
<tr>
<td align >
<div style=>
<table>
<tbody>
<tr></tr>
<tr></tr>
<tr>
<td>
<br></br>
<h2><div class=""></div>Related</h2>
Adaptation:
nameOfBook
<br></br>
Prequel:
nameOfBook2
<br></br>
Other:
<br></br>
<br></br>
<h2></h2>
<table width0"></table>
..........many tables and a.....
</tr>
</tbody>
</table>
</div>
</td>
</tr>
</tbody>
Hopefully its understandable, the site is quite big.
I want the Stuff after Related.
So I want the Sequel connected with the three names and their links.
And then the Prequel name3.
At the moment I get the #content then I get the Array with all h2 Tags and check the 2nd child, if it equals to "Related".
Then I get the parent (td) and iterate over all "a".
In this one td are over 200 a´s.
My plan was now to iterate over those and check if before that "a" comes the term (prequel, sequel or adaption), but it sounds a little bit complicated.
Or I could parse everything between the two h2 Tags, because it´s always there. Or, I could check the link, because the wanted ones have always the same structure. So, search for that structure and then go to the parent and check what term it is.
Anyone can help me with that? In the whole document are no id´s or names.
I am pretty sure that I can find a workaround for that, but it would be just too complicated, whereas with some JS knowledge it should be easy to get.
UPDAtE:
It´s not known how many Prequel/Sequel whatever Tags will be there.
The only thing I really knew is that there will be an "Related" Text between two h2-Tags and the next beginning h2 is the start of something new.
And changed the above example: now it´s the correct Structure, #content is again in an div, but I guess that´s not important because I can access content directly.

You can use document.querySelector or document.querySelectorAll and select the element in a relative manner.
For example:
to select first three a tags within div[id='content']
var allAnchorsInDiv = document.querySelectorAll("div[id='content'] a"); //Basically this is an array of anchors.
//select anchors from array.
If you don't have any Ids at all, then you should probably use a relative path (something like Xpath or CSS selector).
Using CSS selector you will use something like this,
document.querySelectorAll('body>div:first-of-type>a');
Or you could use XPath, refer w3school
Note: If you want things a little easier you could even use jquery to accomplish the same.
Update:
So, for your need you have to do this.
Select the text node with the text.
Find the anchor nodes next to it.
Thus,
// Collect the <a> elements that follow the first text node containing the key term.
var myKeyTerm = "Sequel"; //Set your keyterm here.
var myAnchorTags = [];
// NOTE: the key term is pasted into the XPath string literal, so a term that
// itself contains a single quote would break the expression.
var myTextNode = document.evaluate("//text()[contains(., '"+myKeyTerm +"')]" ,document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null ).singleNodeValue;
if (myTextNode) {
    for (var nextNode = myTextNode.nextSibling; nextNode; nextNode = nextNode.nextSibling) {
        if (nextNode.nodeName === "A") {
            myAnchorTags.push(nextNode);
            continue;
        }
        // Links are usually separated by text nodes holding only whitespace or
        // punctuation (", "); skip those instead of ending the walk on them.
        if (nextNode.nodeType === 3 && /^[\s,;]*$/.test(nextNode.nodeValue)) {
            continue;
        }
        // Anything else (a <br>, the next label, another element) ends the run.
        break;
    }
}
//All the anchors that follow your required text are in the myAnchorTags array.

My take on this would be:
// Report the four elements that directly follow the first <h2> inside #content.
var content = document.getElementById("content");
var h2 = content.getElementsByTagName("h2")[0]; // first h2 below #content
// Hop from element sibling to element sibling, starting at the heading.
var link1 = h2.nextElementSibling,
    link2 = link1.nextElementSibling,
    link3 = link2.nextElementSibling,
    link4 = link3.nextElementSibling;
// The first three are reported as sequels, the fourth as the prequel.
[
    ["Sequel: ", link1],
    ["Sequel: ", link2],
    ["Sequel: ", link3],
    ["Prequel: ", link4]
].forEach(function (entry) {
    console.log(entry[0], entry[1].innerHTML, entry[1].href);
});
This method has the advantage of working even if there are links inside the first (stripped out) table.
But it won't work if the first (stripped out) table contains h2 elements... In that case, instead of
var h2 = content.getElementsByTagName("h2")[0]; // the first h2 element
You should use
var h2 = Array.prototype.filter.call(content.children, function(c) {return c.tagName.toLowerCase() == "h2"})[0];
EDIT
/**
 * Collect the links listed after "Sequel" and "Prequel" labels in the section
 * that starts at the <h2> containing "Related" and ends at the next sibling <h2>.
 *
 * The siblings of the "Related" heading are walked in order. Text nodes act as
 * labels that switch the current mode; every <a> met while a mode is active is
 * recorded as "innerHTML : href".
 *
 * @returns {Array} a two-element array: [prequels, sequels]. Both are empty
 *     when the page has no <h2> containing "Related".
 */
function listlinks(){
    var prequels = [];
    var sequels = [];
    var all_h2_elems = document.getElementsByTagName("h2");
    var h2_start = Array.prototype.filter.call(all_h2_elems, function(el){return el.innerText.indexOf("Related") != -1})[0];
    // Without a "Related" heading there is nothing to walk; bail out instead of
    // throwing on h2_start.parentElement.
    if (!h2_start) {
        console.log("No h2 containing 'Related' was found.");
        return [prequels, sequels];
    }
    var parent = h2_start.parentElement;
    // Only h2 elements that are direct children of the same parent delimit the section.
    var h2_elems = Array.prototype.filter.call(parent.children, function(c) {return c.tagName.toLowerCase() == "h2"});
    if (h2_elems.length < 2) console.log("You lied, you said there were always 2 h2 tags!");
    if (!h2_start.isSameNode(h2_elems[0])) console.log("Hmmm, there should not be a h2 tag before the 'Related' one, fix your question.");
    var sequel = false;
    var prequel = false;
    var current = h2_elems[0];
    var end = h2_elems[1];
    while (current && !current.isSameNode(end))
    {
        if (current.tagName === undefined)
        {
            // Nodes without a tagName (text nodes) carry the labels.
            if (current.nodeValue.indexOf("Sequel") != -1)
            {
                if (sequel || prequel) { console.log("wtf? another sequel?"); break; }
                sequel = true;
            }
            else if (current.nodeValue.indexOf("Prequel") != -1)
            {
                if (!sequel) { console.log("wtf? prequel should be AFTER sequel"); break; }
                prequel = true;
                sequel = false;
            }
            else if (current.nodeValue.match(/[a-z]/)) {
                // Any other text with lowercase letters is a different label:
                // stop collecting links until a known label shows up again.
                prequel = false;
                sequel = false;
            }
        }
        else if (current.tagName.toLowerCase() === "a")
        {
            if (prequel) prequels.push(current.innerHTML + " : " + current.href);
            if (sequel) sequels.push(current.innerHTML + " : " + current.href);
        }
        current = current.nextSibling;
    }
    return [prequels, sequels];
}
// Print both result lists: index 0 holds the prequels, index 1 the sequels.
listlinks().forEach(function(el,i){console.log(i?"Sequels:":"Prequels:",el)})

Related

Highlighting and formatting code snippets using JavaScript

I'm looking for a way to highlight and format code snippets passed as a string for a live style guide. I'm playing around with highlight.js and prettify. They are really helpful and easy for highlighting, but I can't seem to figure out a way to format, or whether they can actually do that or not.
By formatting, I mean tabs and newlines to make code legible. I need to pass code as a string to automate the output of dust template I'm using for the style guide.
That is, I want to pass:
"<table><tr><td class="title">Name</td><td class="title">Category</td><td class="title">Results</td></tr></table>"
And get something like:
<table>
<tr>
<td class="title">Name</td>
<td class="title">Category</td>
<td class="title">Results</td>
</tr>
</table>
Any ideas on how to accomplish this?
Thanks!

You could parse this as HTML into a DOM and then traverse every element, writing it out and indenting it with every iteration.
This code will do the job. Feel free to use it and surely to improve it. It's version 0.0.0.1.
// Markup that should be pretty-printed.
var htmlString = '<table><tr><td class="title">Name</td><td class="title">Category</td><td class="title">Results</td></tr></table>';
// Output panel: an <xmp> element displays whatever markup is written into it as plain text.
var documentDOMConsole = document.createElement("xmp");
documentDOMConsole.style.display = "block";
document.body.appendChild(documentDOMConsole);
// Detached container; inserting the string here lets the browser parse it into elements.
var documentDOM = document.createElement("div");
documentDOM.insertAdjacentHTML('afterbegin', htmlString);
/**
 * Build the indentation prefix for one nesting level.
 *
 * @param {number} multiplier - how many tab characters to produce.
 * @returns {string} one "\t" per counted step; empty when multiplier is zero or negative.
 */
function indentor(multiplier)
{
    var tabs = [];
    var level = 0;
    // Count upwards so fractional multipliers round up, as a plain index loop would.
    while (level < multiplier)
    {
        tabs.push("\t");
        level += 1;
    }
    return tabs.join("");
}
/**
 * Write every child element of `element` to documentDOMConsole, one tag per
 * line, indented by nesting depth, recursing into grandchildren.
 *
 * For each child the output is: the opening tag, the child's own text (one
 * level deeper, if it has any), its child elements (recursively), and finally
 * the closing tag. Elements without a closing tag (e.g. <br>) get only their
 * opening tag.
 *
 * @param {Element} element - parent whose element children are printed.
 * @param {number} indent - indentation level used for the children's tags.
 */
function recursiveWalker(element, indent)
{
    // Length is read once; output goes to documentDOMConsole, not to `element`.
    var childCount = element.children.length;
    for (var i = 0; i < childCount; ++i)
    {
        var child = element.children[i];
        var indenting = indentor(indent);
        // Every tag in the child's serialized markup; the first is its opening
        // tag (with attributes), the last is its closing tag if it has one.
        var tags = child.outerHTML.match(/<[^>]*>/g);
        var elementTag = tags[0];
        var elementEndTag = tags[tags.length - 1];
        documentDOMConsole.innerHTML += indenting + elementTag + "\n";
        // Text that belongs directly to this child, not to its descendants.
        var elementText = getElementText(child);
        if (elementText.length > 0)
        {
            documentDOMConsole.innerHTML += (indenting + indentor(1)) + elementText + "\n";
        }
        if (child.children.length > 0)
        {
            recursiveWalker(child, (indent + 1));
        }
        // Only print a real closing tag. Comparing against "</name>" avoids
        // printing an attribute-less void element such as <br> a second time,
        // where the last tag found is the opening tag itself.
        if (elementEndTag === "</" + child.nodeName.toLowerCase() + ">")
        {
            documentDOMConsole.innerHTML += indenting + elementEndTag + "\n";
        }
    }
}
/**
 * Return the text held directly by an element, ignoring text inside its
 * child elements.
 *
 * @param {Node} el - node whose direct children are inspected.
 * @returns {string} the data of all direct text-node children, concatenated in order.
 */
function getElementText(el)
{
    // Declared locally: the undeclared assignments leaked globals and throw in strict mode.
    var texts = [];
    for (var child = el.firstChild; child; child = child.nextSibling)
    {
        if (child.nodeType === 3) // 3 === text node
        {
            texts.push(child.data);
        }
    }
    return texts.join("");
}
// Start the walk at the parsed container, printing its children at indent level 1.
recursiveWalker(documentDOM, 1);
http://jsfiddle.net/f2L82m8h/

Select element by content

Within JavaScript, are you able to target elements on a page via pseudo-classes? For example:
<table id="loginInnerTable">
<tbody>
<tr>
<td colspan="3">
<span class="required">*</span><span> = required</span>
</td>
</tr>
<tr>
<td>User ID <span class="required">*</span></td>
</tr>
</tbody>
</table>
How would I target the 'User ID' text to change it with .innerHTML? This is an example of a form that I don't have access to the code itself, except via JS.
Otherwise, what is the best method?

You can always make use of DOM traversal and HTML DOM extensions. This depends on your actual markup though. In this specific case you can do the following:
Get a reference to the table:
var table = document.getElementById('loginInnerTable');
Get a reference to the second row:
var row = table.rows[1];
Get a reference to the first cell:
var cell = row.cells[0];
Change the value of the first child node (works because it is a text node):
cell.firstChild.nodeValue = 'Some new text';
Or iterate over all text nodes to find the right one (if you have mixed element and text nodes or the node you want to change is not the first child):
// Scan the cell's direct children for the text node that mentions 'User ID'
// and replace its text. The loop condition is checked first, so an empty cell
// (no firstChild) is simply skipped instead of throwing.
for (var node = cell.firstChild; node; node = node.nextSibling) {
    if (node.nodeType === 3 && node.nodeValue.indexOf('User ID') > -1) {
        node.nodeValue = 'Some new text';
        break; // only the first matching text node is changed
    }
}
If you don't want to restrict your solution too much to the current structure (it might change from time to time), then iterating over all rows/cells and find the desired cell would be a more flexible approach. Karl-André Gagnon showed this in his answer.

How about something like this to select the element, then just use it as a reference to update using innerHTML
// First cell of the second row. getElementsByTagName returns a collection, so
// the trailing [0] is needed to get the <td> element itself.
var userTD = document.getElementById('loginInnerTable').getElementsByTagName('tbody')[0].getElementsByTagName('tr')[1].getElementsByTagName('td')[0];
I'm not sure how efficient it is, but it should do the trick.
As others have mentioned it's not massively reliable - as if the table structure changes (which you say is out of your control) you'll have to change your code to work again.

Here a simple code that will change the text :
// Rewrite every table cell whose markup mentions 'User ID'.
var tdEl = document.getElementsByTagName('td');
var cpt = 0;
while (cpt < tdEl.length) {
    if (tdEl[cpt].innerHTML.indexOf('User ID') !== -1) {
        tdEl[cpt].innerHTML = 'some text <span class="required">*</span>';
    }
    cpt++;
}
If you want to make it a function (to reuse the code) :
changeNode('td', 'User ID', 'some text <span class="required">*</span>')
/**
 * Replace the markup of every element with the given tag name whose current
 * innerHTML contains `text`.
 *
 * @param {string} node - tag name to look up via document.getElementsByTagName.
 * @param {string} text - substring searched for in each element's innerHTML.
 * @param {string} replace - markup assigned as the new innerHTML of each match.
 */
function changeNode(node, text, replace){
    var candidates = document.getElementsByTagName(node);
    var position = 0;
    // The length is re-read on every pass, exactly like a plain index loop.
    while (position < candidates.length) {
        var current = candidates[position];
        position += 1;
        if (current.innerHTML.indexOf(text) === -1) {
            continue;
        }
        current.innerHTML = replace;
    }
}

How would I target the 'User ID' text to change it with .innerHTML?
You won't. Of course you could do something like
// Shown as the approach to avoid: the table's markup is read as a string, the
// first "User ID" occurrence is replaced, and the whole table is re-parsed.
var tab = document.getElementById('loginInnerTable');
tab.innerHTML = tab.innerHTML.replace("User ID", "something else");
but that would replace the whole table, crushing and re-parsing its DOM (and it won't work in IE, of course).
Instead, you will use DOM manipulation to get and change that text node:
// Reach the text node through the table's row/cell collections and change only that node.
var tab = document.getElementById('loginInnerTable');
var td = tab.rows[1].cells[0]; // first cell of the second row
var text = td.firstChild;      // the cell's first child node
text.nodeValue = "something else";

Search for non-nested nodes with specific attribute set

The long and short of it is I'm working on a small library in JavaScript that will replace <div src="somesite"></div> with the content from the specified source. This would allow coders to create dynamic pages without having to do more work server-side and without the annoyance of using iframes.
What I need is an efficient way to get the topmost div nodes of a branch with a src attribute. E.g.:
<div src="somesite/pagelet.htm" id="div1">
<div src="somesite/fallback.htm" id="div2"></div>
</div>
<div src="somesite/pagelet2.htm" id="div3"></div>
I want to retrieve #div1 and #div3 and ignore #div2 until later. At the moment I'm using the following function, but am wondering if there is a more efficent way to do this:
/**
 * Find the outermost elements of a given tag that carry a given attribute.
 *
 * If `rootEle` itself matches, it is the only result. Otherwise every
 * descendant of `rootEle` with that tag and attribute is returned, except
 * those that have a matching ancestor (nested matches are left for later).
 *
 * @param {Element|Document} rootEle - node whose subtree is searched.
 * @param {string} tag - tag name to match, case-insensitive ('*' matches any tag).
 * @param {string} attr - attribute that must be present.
 * @returns {Element[]} matching top-most elements in document order; always an
 *     array, empty when nothing matches or `rootEle` cannot be searched.
 */
function getRootElementsByAttribute(rootEle, tag, attr) {
    var nodes = [];
    if (!rootEle || typeof rootEle.getElementsByTagName !== 'function') {
        return nodes;
    }
    var wanted = String(tag).toLowerCase();

    // True for element nodes with the wanted tag and attribute. Non-element
    // ancestors such as the document have no tagName and never match.
    function matches(node) {
        return typeof node.tagName === 'string' &&
            (wanted === '*' || node.tagName.toLowerCase() === wanted) &&
            typeof node.hasAttribute === 'function' &&
            node.hasAttribute(attr);
    }

    if (matches(rootEle)) {
        return [rootEle];
    }

    var eles = rootEle.getElementsByTagName(wanted);
    for (var a = 0; a < eles.length; a++) {
        var ele = eles[a];
        if (!ele.hasAttribute(attr)) {
            continue;
        }
        // Walk up the ancestor chain; a matching ancestor means `ele` is nested.
        var isRoot = true;
        for (var ancestor = ele.parentNode; ancestor; ancestor = ancestor.parentNode) {
            if (matches(ancestor)) {
                isRoot = false;
                break;
            }
        }
        if (isRoot) {
            nodes.push(ele);
        }
    }
    return nodes;
}
Please no answers suggesting the use of a library. It seems overkill to import a whole library when all it would be used for is this single function

You could try to use an XPath expression to get all root divs with the attribute source using something like the following XPath expression:
/div[@src]
/div selects all divs that are on the root level. For all divs in the document use //div.
[@src] specifies that you only want nodes with the 'src' attribute.
// Placeholder: the current page is used here; load or parse your own document instead if needed.
var xmlDoc = document;
// XPath tests for an attribute with '@', so "[@src]" keeps only nodes that have a src attribute.
var xpath = "/div[@src]";
var nodes = xmlDoc.evaluate(xpath, xmlDoc, null, XPathResult.ANY_TYPE, null);

How can I make greasemonkey search only within a table

Below is my code and currently it searches the whole webpage. I'm trying to figure out how to make it search only within a table. (There is only one table on the page).
Any help would be appreciated.
// Links whose text contains 'gg' are preferred; links containing 'u' are the fallback.
var TargetLink = $("a:contains('gg')");
var TargetSink = $("a:contains('u')");
// Navigate to the first link of the first non-empty result set, if any.
(function (candidates) {
    for (var i = 0; i < candidates.length; i += 1) {
        var matched = candidates[i];
        if (matched && matched.length) {
            window.location.href = matched[0].href;
            return;
        }
    }
}([TargetLink, TargetSink]));

// Prefixing the selector with "table" restricts both searches to links inside a <table>.
var TargetLink = $("table a:contains('gg')");
var TargetSink = $("table a:contains('u')");
EDIT:
You say there is only one table on the page. Do you absolutely know there will only ever be one table? Even if you think the answer is yes, I would try and add an id or class selector so that things won't break in the future.
Also, the following code can be simplified:
if (TargetLink && TargetLink.length)
to:
if (TargetLink.length)
Re: "could I combine those 2 variables into 1":
Use a comma in the selector, like so:
//--- Need more of the HTML structure for a better selector.
var TargetLink = $("table")
.find ("a:contains('gg'), a:contains('u')")
;
if (TargetLink.length) {
window.location.href = TargetLink[0].href;
}
If both kind of links are found, 'gg' will be used (first).

how to dynamically address a word/string with javascript in an html-document and then tag it?

I have an HTML-document:
<html>
<body>
<p>
A funny joke:
<ul>
<li>do you know why six is afraid of seven?
<li>because seven ate nine.
</ul>
Oh, so funny!
</p>
</body>
</html>
Now I want to identify the first occurrence of "seven" and tag it with
<span id="link1" class="link">
How can this be accomplished?
Do you have to parse the DOM-tree or is it possible to get the whole code within the body-section and then search for the word?
In both cases, after I found the word somewhere, how do you identify it and change it's DOM-parent to span (I guess that's what has to be done) and then add the mentioned attributes?
It's not so much a code I would expect, but what methods or concepts will do the job.
And I am not so much interested in a framework solution but in a pure JavaScript way.

You need to find a DOM node of type TEXT_NODE (3) that contains your expected word. Then you need to split that node into three.
First is a TEXT_NODE which contains a text before the word you search, second one is a SPAN node containing the word you search, and third one is another TEXT_NODE containing an original node's tail (all after searched word).
Here is a source code...
<html>
<head>
<style type="text/css">
.link {
color: red;
}
</style>
</head>
<body>
<p>
A funny joke:
<ul>
<li>do you know why six is afraid of seven?
<li>because seven ate nine.
</ul>
Oh, so funny!
</p>
<script type="text/javascript">
/**
 * Wrap the first occurrence of `what` below `where` in
 * <span id="link1" class="link">.
 *
 * The subtree is searched depth-first in document order. The text node that
 * contains the match is cut down to the text before it; a span holding the
 * match and a new text node holding the remainder are inserted right after it.
 *
 * @param {Node} where - root of the subtree to search.
 * @param {string} what - text to look for (case-sensitive).
 * @returns {boolean} true when an occurrence was found and wrapped, else false.
 */
function search(where, what) {
    if (!what) {
        return false; // an empty needle would "match" at position 0 of any text node
    }
    var children = where.childNodes;
    for (var i = 0; i < children.length; i++) {
        var child = children[i];
        if (child.nodeType === 3) { // a TEXT_NODE
            var value = child.nodeValue;
            var pos = value.indexOf(what);
            if (pos === -1) {
                continue;
            }
            var parent = child.parentNode;
            var following = child.nextSibling;
            var span = document.createElement('span');
            span.className = 'link';
            span.id = 'link1';
            // A text node is used for the span content so the match is never parsed as markup.
            span.appendChild(document.createTextNode(what));
            child.nodeValue = value.substring(0, pos);
            // Insert right after the shortened text node, not at the end of the
            // parent, so later siblings keep their position.
            parent.insertBefore(span, following);
            parent.insertBefore(document.createTextNode(value.substring(pos + what.length)), following);
            return true;
        }
        // Propagate success so no further occurrence gets tagged with the same id.
        if (search(child, what)) {
            return true;
        }
    }
    return false;
}
search(document.getElementsByTagName('body')[0], 'seven');
</script>
</body>
</html>

This is a function I’ve written a few years ago that searches for specific text, and highlights them (puts the hits in a span with a specific class name).
It walks the DOM tree, examining the text content. Whenever it finds a text node containing the looked-for text, it will replace that text node by three new nodes:
one text node with the text preceding the match,
one (newly created) span element containing the matching text,
and one text node with the text following the match.
This is the function as I have it. It’s part of a larger script file, but it should run independently as well. (I’ve commented out a call to ensureElementVisible which made the element visible, since the script also had folding and expanding capabilities).
It does one (other) thing that you probably won’t need: it turns the search text into a regular expression matching any of the multiple words.
/**
 * Find text nodes matching a search phrase and highlight the individual
 * search words inside them.
 *
 * The phrase is turned into a case-insensitive pattern in which any run of
 * non-word characters matches loosely (\W*). Every text node below `a_top`
 * that matches is a hit. A hit that is the only child of its parent is
 * replaced by plain text nodes plus one <span class="search-hit"> per matched
 * word; other hits are left untouched.
 *
 * Relies on an external walkDOMTree(root, callback) helper.
 *
 * @param {string} a_text - the search phrase.
 * @param {Node} [a_top] - root of the subtree to search; defaults to document.body.
 * @returns {Node[]} the created highlight spans and the untouched hit text
 *     nodes, in the order the hits were found. Empty when the phrase contains
 *     no word characters.
 */
function findText(a_text, a_top) {
    // No 'g' flag: this pattern is only used with .test(), and a global regex
    // would carry lastIndex from one text node to the next and skip matches.
    var rexPhrase = new RegExp(a_text.replace(/([\\\/\*\?\+\.\[\]\{\}\(\)\|\^\$])/g, '\\$1').replace(/\W+/g, '\\W*')
        , 'i');
    var terms = [];
    var rexSearchTokens = /[\w]+/g;
    var match;
    while ((match = rexSearchTokens.exec(a_text)) !== null) {
        terms.push(match[0]);
    }
    // Without any word there is nothing to highlight; an empty alternation
    // would also match the empty string forever in the exec loop below.
    if (terms.length === 0) {
        return [];
    }
    var rexTerm = new RegExp('\\b(' + terms.join('|') + ')', 'gi');
    var hits = [];
    walkDOMTree(a_top || document.body,
        function search(a_element) {
            if (a_element.nodeName === '#text') {
                if (rexPhrase.test(a_element.nodeValue)) {
                    // ensureElementVisible(a_element, true);
                    hits.push(a_element);
                }
            }
        });
    // highlight the search terms in the found elements
    var result = [];
    for (var i = 0; i < hits.length; i++) {
        var hit = hits[i];
        var parent = hit.parentNode;
        if (parent.childNodes.length !== 1) {
            // Text mixed with sibling nodes is reported but not rewritten.
            result.push(hit);
            continue;
        }
        // Replace the text node by: text before, highlighted word, ..., text after.
        var text = hit.nodeValue;
        parent.removeChild(hit);
        var prevLastIndex = 0;
        while ((match = rexTerm.exec(text)) !== null) {
            parent.appendChild(document.createTextNode(text.substring(prevLastIndex, match.index)));
            var highlightedTerm = parent.appendChild(document.createElement('SPAN'));
            highlightedTerm.className = 'search-hit';
            highlightedTerm.appendChild(document.createTextNode(match[0]));
            prevLastIndex = match.index + match[0].length;
            // The span takes the place of the original hit in the result list.
            result.push(highlightedTerm);
        }
        parent.appendChild(document.createTextNode(text.substring(prevLastIndex)));
    }
    return result;
}

I found the following so question:
Find text string using jQuery?
This appears to be close to what you're trying to do. Now are you attempting to wrap just the text "seven" or are you attempting to wrap the entire content of the <li>?

We Keep Coding

JavaScript is the programming language of the Web.

Parsing Data from HTML without IDs - javascript

Related

Highlighting and formatting code snippets using JavaScript

Select element by content

Search for non-nested nodes with specific attribute set

How can I make greasemonkey search only within a table

how to dynamically address a word/string with javascript in an html-document and then tag it?

Categories

Resources