Parse a HTML structure, what JS tools are available - javascript

I have to get out information from a HTML table from a website. I want to do a HTML request from a Node.ja server to that website and parse the HTML table. Are there any libraries or techniques for JS except regular expression to parse the data from the table cells?
Sorry I'm very new in programming.

Look at the excellent Cheerio library:
https://github.com/MatthewMueller/cheerio
Examples are on the Git.

var doc = document.implementation.createDocument(null, your_downloaded_html_page_as_string, null);
You can use normal DOM function like getElementByTagName,firstChild,..etc to get your actual data from the HTML page you downloaded.
Refer Parse a HTML String with JS for more methods.

jsdom is a great module for this
// Count all of the links from the Node.js build page
var jsdom = require("jsdom");
jsdom.env(
"http://nodejs.org/dist/",
["http://code.jquery.com/jquery.js"],
function (errors, window) {
console.log("there have been", window.$("a").length, "nodejs releases!");
}
);

I would use JQuery. You could iterate through all table datas like so: (this will alert the html inside every table data)
$('td').each( function () { alert( $(this).html() } );
or for a specific table:
$('#specific_table_id.td').each( function () { alert( $(this).html() } );

Related

Input Processing in JavaScipt

I'm new to Web Development (including JavaScript and HTML) and have a few issues within my personal project that seem to have no clear fixes.
Overview
My project is taking input from a user on the website, and feeding it to my back-end to output a list of word completion suggestions.
For example, input => "bass", then the program would suggest "bassist", "bassa", "bassalia", "bassalian", "bassalan", etc. as possible completions for the pattern "bass" (these are words extracted from an English dictionary text file).
The backend - running on Node JS libraries
trie.js file:
/* code for the trie not fully shown */
var Deque = require("collections/deque"); // to be used somewhere
function add_word_to_trie(word) { ... }
function get_words_matching_pattern(pattern, number_to_get = DEFAULT_FETCH) { ... }
// read in words from English dictionary
var file = require('fs');
const DICTIONARY = 'somefile.txt';
function preprocess() {
file.readFileSync(DICTIONARY, 'utf-8')
.split('\n')
.forEach( (item) => {
add_word_to_trie(item.replace(/\r?\n|\r/g, ""));
});
}
preprocess();
module.exports = get_words_matching_trie;
The frontend
An HTML script that renders the visuals for the website, as well as getting input from the user and passing it onto the backend script for getting possible suggestions. It looks something like this:
index.html script:
<!DOCTYPE HTML>
<html>
<!-- code for formatting website and headers not shown -->
<body>
<script src = "./trie.js">
function get_predicted_text() {
const autofill_options = get_words_matching_pattern(input.value);
/* add the first suggestion we get from the autofill options to the user's input
arbitrary, because I couldn't get this to actually work. Actual version of
autofill would be more sophisticated. */
document.querySelector("input").value += autofill_options[0];
}
</script>
<input placeholder="Enter text..." oninput="get_predicted_text()">
<!-- I get a runtime error here saying that get_predicted_text is not defined -->
</body>
</html>
Errors I get
Firstly, I get the obvious error of 'require()' being undefined on the client-side. This, I fix using browserify.
Secondly, there is the issue of 'fs' not existing on the client-side, for being a node.js module. I have tried running the trie.js file using node and treating it with some server-side code:
function respond_to_user_input() {
fs.readFile('./index.html', null, (err, html) => {
if (err) throw err;
http.createServer( (request, response) => {
response.write(html);
response.end();
}).listen(PORT);
});
respond_to_user_input();
}
With this, I'm not exactly sure how to edit document elements, such as changing input.value in index.html, or calling the oninput event listener within the input field. Also, my CSS formatting script is not called if I invoke the HTML file through node trie.js command in terminal.
This leaves me with the question: is it even possible to run index.html directly (through Google Chrome) and have it use node JS modules when it calls the trie.js script? Can the server-side code I described above with the HTTP module, how can I fix the issues of invoking my external CSS script (which my HTML file sends an href to) and accessing document.querySelector("input") to edit my input field?

Get Ace Editor's content through the DOM

I am running some tests against a page using Nightwatch.js. I need to get the text content of an Ace Editor that's on the page, so that I can compare it against some JSON.
Is this possible?
Thanks in advance!
Instead, I hooked into the page's angular controller and grabbed the data I needed from there:
'is the Splash-Live schema set up correctly?': function (client) {
var pageSplashLive = client.page.pageSplashLive();
var code;
login(client);
pageSplashLive.navigate()
.waitForElementVisible('#jsonButton', 15000)
.click('#jsonButton')
.waitForElementVisible('.ace_content', 10000)
.api.execute("return angular.element($('.data-editor-form')).scope()['ctrl']['selectedData']['schema']['properties'];", [], function(response) {
code = JSON.stringify(response.value);
console.log(code);
client.assert.equal(code,
'json_goes_here');
});
},

Parse dynamically loaded document on background using createHTMLDocument

Using this post, I'm trying to load document via ajax and find contents of specific document node(s) so that I can display them without re-navigating browser.
However, my document always seems to be an empty document.
Ajax callback:
function processRatingToken(data) { //Data is just standart HTML document string
var doc = document.implementation.createHTMLDocument();
doc.open();
//Replace scripts
data = data.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, "");
//Write HTML to the new document
doc.write(data);
doc.close();
console.log(doc.body); //Empty
}
So what's wrong?
Note: I'm using this strategy, because I'm building a Greasemonkey Userscript. If you are developing an Ajax application, this strategy is NOT recomended. Use JSON instead.
There is a workaround with .innerHTML property:
doc.childNodes[1].innerHTML = data;
Where .childNodes[1] is the <html> element.

WP8 App misbehaving due to StreamWrite in JavaScript

I would like to save the results calculated on html page in a textfile using javascript.
<script type="text/javascript">
window.onload = function () {
var sw : StreamWriter = new StreamWriter("HTML_Results.txt");
sr.Write('xyz");
*** calculations ******
sr.Write (result);
}
</script>
by doing this, my WP8 App is misbehaving and not displaying images as usual. This app is an Image Fader (calculates FPS).
Also tried:
StreamWriter sr;
try {
sr = new StreamWriter("\HTML5\HTMLResults.txt");
sr.Write("xyz");
File.SetAttributes("HTML5\HTMLResults.txt", FileAttributes.Hidden);
} catch(IOException ex) {
console.write ("error writing"); //handling IO
}
The aim is to:
Extract calculated values of several html pages(after getting loaded
one by one) in a single text file.
A Resultant HTML that reads this
text file and displays results in a tabular format.
Is there a better way to this job or the above can be rectified and used? Appreciate help.
Perhaps I've misunderstood your code but it looks like you're trying to write Java within JavaScript scripting tags. You cannot write Java in an HTML document. As far as I know, client-side JavaScript (which given your <script> tags is I guess what you're trying to write) can't perform the kind of file I/O operations you seem to want here.
You need to use Node JS to use JavaScript for something like that and then you're talking server-side. The closest you can get on client-side is using the new localStorage feature in HTML5 (not supported by all browsers).

How do I use jQuery in Windows Script Host?

I'm working on some code that needs to parse numerous files that contain fragments of HTML. It seems that jQuery would be very useful for this, but when I try to load jQuery into something like WScript or CScript, it throws an error because of jQuery's many references to the window object.
What practical way is there to use jQuery in code that runs without a browser?
Update: In response to the comments, I have successfully written JavaScript code to read the contents of files using new ActiveXObject('Scripting.FileSystemObject');. I know that ActiveX is evil, but this is just an internal project to get some data out of some files that contain HTML fragments and into a proper database.
Another Update: My code so far looks about like this:
var fileIo, here;
fileIo = new ActiveXObject('Scripting.FileSystemObject');
here = unescape(fileIo.GetParentFolderName(WScript.ScriptFullName) + "\\");
(function() {
var files, thisFile, thisFileName, thisFileText;
for (files = new Enumerator(fileIo.GetFolder(here).files); !files.atEnd(); files.moveNext()) {
thisFileName = files.item().Name;
thisFile = fileIo.OpenTextFile(here + thisFileName);
thisFileText = thisFile.ReadAll();
// I want to do something like this:
s = $(thisFileText).find('input#txtFoo').val();
}
})();
Update: I posted this question on the jQuery forums as well: http://forum.jquery.com/topic/how-to-use-jquery-without-a-browser#14737000003719577
Following along with your code, you could create an instance of IE using Windows Script Host, load your html file in to the instance, append jQuery dynamically to the loaded page, then script from that.
This works in IE8 with XP, but I'm aware of some security issues in Windows 7/IE9. IF you run into problems you could try lowering your security settings.
var fileIo, here, ie;
fileIo = new ActiveXObject('Scripting.FileSystemObject');
here = unescape(fileIo.GetParentFolderName(WScript.ScriptFullName) + "\\");
ie = new ActiveXObject("InternetExplorer.Application");
ie.visible = true
function loadDoc(src) {
var head, script;
ie.Navigate(src);
while(ie.busy){
WScript.sleep(100);
}
head = ie.document.getElementsByTagName("head")[0];
script = ie.document.createElement('script');
script.src = "http://ajax.googleapis.com/ajax/libs/jquery/1.8.3/jquery.min.js";
head.appendChild(script);
return ie.document.parentWindow;
}
(function() {
var files, thisFile, win;
for (files = new Enumerator(fileIo.GetFolder(here).files); !files.atEnd(); files.moveNext()) {
thisFile = files.item();
if(fileIo.GetExtensionName(thisFile)=="htm") {
win = loadDoc(thisFile);
// your jQuery reference = win.$
WScript.echo(thisFile + ": " + win.$('input#txtFoo').val());
}
}
})();
This is pretty easy to do in Node.js with the cheerio package. You can read in arbitrary HTML from whatever source you want, parse it with cheerio and then access the parsed elements using jQuery style selectors.

Categories