Is it safe to validate a URL with a regexp? - javascript

In my web app I've got a form field where the user can enter an URL. I'm already doing some preliminary client-side validation and I was wondering if I could use a regexp to validate if the entered string is a valid URL. So, two questions:
Is it safe to do this with a regexp? A URL is a complex beast, and just like you shouldn't use a regexp for parsing HTML, I'm worried that it might be unsuitable for a URL as well.
If it can be done, what would be a good regexp for the task? (I know that Google turns up countless regexps, but I'm worried about their quality).
My goal is to prevent a situation where the URL appears in the web page and is unusable by the browser.

Well... maybe. People often ask a similar question about email addresses, and with those you would need a horrendously complicated regular expression (i.e. a couple pages long, at least) to correctly validate them. I don't think URLs are quite as complicated (the W3C has a document describing their format) but still, any reasonably short regexp you come up with will probably block some valid URLs.
I would suggest thinking about what kinds of URLs you need to be accepting. Maybe for your purposes, blocking the occasional valid-but-weird submission is fine, and in that case you can use a simple regex that matches most URLs, like the one in Dobiatowski's answer. Or you could use a regex that accepts all valid URLs and a few invalid ones, if that works for you. But I'd be wary of trying to find a regular expression that accepts exactly all valid URLs and no invalid ones. If you want to have 100% foolproof verification in that way, I'd suggest using a client-side validation of the second type I mentioned (that accepts a few invalid URLs) and doing a more comprehensive check on the server side, using some library in whatever language you are using to process the form data.

As stated by Crescent Fresh, in the comments, there are similar questions to this one which I didn't find. One of them also supplies the full standards-compliant regex for validating an URL:
(?:http://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.
)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)
){3}))(?::(?:\d+))?)(?:/(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F
\d]{2}))|[;:#&=])*)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{
2}))|[;:#&=])*))*)(?:\?(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{
2}))|[;:#&=])*))?)?)|(?:ftp://(?:(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?
:%[a-fA-F\d]{2}))|[;?&=])*)(?::(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-
fA-F\d]{2}))|[;?&=])*))?#)?(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-
)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?
:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?))(?:/(?:(?:(?:(?:[a-zA-Z\d$\-_.+!
*'(),]|(?:%[a-fA-F\d]{2}))|[?:#&=])*)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'()
,]|(?:%[a-fA-F\d]{2}))|[?:#&=])*))*)(?:;type=[AIDaid])?)?)|(?:news:(?:
(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;/?:&=])+#(?:(?:(
?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[
a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3})))|(?:[a-zA-Z](
?:[a-zA-Z\d]|[_.+-])*)|\*))|(?:nntp://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[
a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d
])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?)/(?:[a-zA-Z](?:[a-zA-Z
\d]|[_.+-])*)(?:/(?:\d+))?)|(?:telnet://(?:(?:(?:(?:(?:[a-zA-Z\d$\-_.+
!*'(),]|(?:%[a-fA-F\d]{2}))|[;?&=])*)(?::(?:(?:(?:[a-zA-Z\d$\-_.+!*'()
,]|(?:%[a-fA-F\d]{2}))|[;?&=])*))?#)?(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a
-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d]
)?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?))/?)|(?:gopher://(?:(?:
(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:
(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+
))?)(?:/(?:[a-zA-Z\d$\-_.+!*'(),;/?:#&=]|(?:%[a-fA-F\d]{2}))(?:(?:(?:[
a-zA-Z\d$\-_.+!*'(),;/?:#&=]|(?:%[a-fA-F\d]{2}))*)(?:%09(?:(?:(?:[a-zA
-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;:#&=])*)(?:%09(?:(?:[a-zA-Z\d$
\-_.+!*'(),;/?:#&=]|(?:%[a-fA-F\d]{2}))*))?)?)?)?)|(?:wais://(?:(?:(?:
(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:
[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?
)/(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*)(?:(?:/(?:(?:[a-zA
-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*)/(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(
?:%[a-fA-F\d]{2}))*))|\?(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]
{2}))|[;:#&=])*))?)|(?:mailto:(?:(?:[a-zA-Z\d$\-_.+!*'(),;/?:#&=]|(?:%
[a-fA-F\d]{2}))+))|(?:file://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]
|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:
(?:\d+)(?:\.(?:\d+)){3}))|localhost)?/(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'()
,]|(?:%[a-fA-F\d]{2}))|[?:#&=])*)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(
?:%[a-fA-F\d]{2}))|[?:#&=])*))*))|(?:prospero://(?:(?:(?:(?:(?:[a-zA-Z
\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)
*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?)/(?:(?:(?:(?
:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[?:#&=])*)(?:/(?:(?:(?:[a-
zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[?:#&=])*))*)(?:(?:;(?:(?:(?:[
a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[?:#&])*)=(?:(?:(?:[a-zA-Z\d
$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[?:#&])*)))*)|(?:ldap://(?:(?:(?:(?:
(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:
[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?
))?/(?:(?:(?:(?:(?:(?:(?:[a-zA-Z\d]|%(?:3\d|[46][a-fA-F\d]|[57][Aa\d])
)|(?:%20))+|(?:OID|oid)\.(?:(?:\d+)(?:\.(?:\d+))*))(?:(?:%0[Aa])?(?:%2
0)*)=(?:(?:%0[Aa])?(?:%20)*))?(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F
\d]{2}))*))(?:(?:(?:%0[Aa])?(?:%20)*)\+(?:(?:%0[Aa])?(?:%20)*)(?:(?:(?
:(?:(?:[a-zA-Z\d]|%(?:3\d|[46][a-fA-F\d]|[57][Aa\d]))|(?:%20))+|(?:OID
|oid)\.(?:(?:\d+)(?:\.(?:\d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])
?(?:%20)*))?(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*)))*)(?:(
?:(?:(?:%0[Aa])?(?:%20)*)(?:[;,])(?:(?:%0[Aa])?(?:%20)*))(?:(?:(?:(?:(
?:(?:[a-zA-Z\d]|%(?:3\d|[46][a-fA-F\d]|[57][Aa\d]))|(?:%20))+|(?:OID|o
id)\.(?:(?:\d+)(?:\.(?:\d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])?(
?:%20)*))?(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*))(?:(?:(?:
%0[Aa])?(?:%20)*)\+(?:(?:%0[Aa])?(?:%20)*)(?:(?:(?:(?:(?:[a-zA-Z\d]|%(
?:3\d|[46][a-fA-F\d]|[57][Aa\d]))|(?:%20))+|(?:OID|oid)\.(?:(?:\d+)(?:
\.(?:\d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])?(?:%20)*))?(?:(?:[a
-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*)))*))*(?:(?:(?:%0[Aa])?(?:%2
0)*)(?:[;,])(?:(?:%0[Aa])?(?:%20)*))?)(?:\?(?:(?:(?:(?:[a-zA-Z\d$\-_.+
!*'(),]|(?:%[a-fA-F\d]{2}))+)(?:,(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-f
A-F\d]{2}))+))*)?)(?:\?(?:base|one|sub)(?:\?(?:((?:[a-zA-Z\d$\-_.+!*'(
),;/?:#&=]|(?:%[a-fA-F\d]{2}))+)))?)?)?)|(?:(?:z39\.50[rs])://(?:(?:(?
:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?
:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))
?)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))+)(?:\+(?:(?:
[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))+))*(?:\?(?:(?:[a-zA-Z\d$\-_
.+!*'(),]|(?:%[a-fA-F\d]{2}))+))?)?(?:;esn=(?:(?:[a-zA-Z\d$\-_.+!*'(),
]|(?:%[a-fA-F\d]{2}))+))?(?:;rs=(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA
-F\d]{2}))+)(?:\+(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))+))*)
?))|(?:cid:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;?:#&=
])*))|(?:mid:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;?:#
&=])*)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;?:#&=]
)*))?)|(?:vemmi://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z
\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\
.(?:\d+)){3}))(?::(?:\d+))?)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a
-fA-F\d]{2}))|[/?:#&=])*)(?:(?:;(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a
-fA-F\d]{2}))|[/?:#&])*)=(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d
]{2}))|[/?:#&])*))*))?)|(?:imap://(?:(?:(?:(?:(?:(?:(?:[a-zA-Z\d$\-_.+
!*'(),]|(?:%[a-fA-F\d]{2}))|[&=~])+)(?:(?:;[Aa][Uu][Tt][Hh]=(?:\*|(?:(
?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[&=~])+))))?)|(?:(?:;[
Aa][Uu][Tt][Hh]=(?:\*|(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2
}))|[&=~])+)))(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[
&=~])+))?))#)?(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])
?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:
\d+)){3}))(?::(?:\d+))?))/(?:(?:(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:
%[a-fA-F\d]{2}))|[&=~:#/])+)?;[Tt][Yy][Pp][Ee]=(?:[Ll](?:[Ii][Ss][Tt]|
[Ss][Uu][Bb])))|(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))
|[&=~:#/])+)(?:\?(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[
&=~:#/])+))?(?:(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=(?:[1-
9]\d*)))?)|(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[&=~
:#/])+)(?:(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=(?:[1-9]\d*
)))?(?:/;[Uu][Ii][Dd]=(?:[1-9]\d*))(?:(?:/;[Ss][Ee][Cc][Tt][Ii][Oo][Nn
]=(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[&=~:#/])+)))?))
)?)|(?:nfs:(?:(?://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-
Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:
\.(?:\d+)){3}))(?::(?:\d+))?)(?:(?:/(?:(?:(?:(?:(?:[a-zA-Z\d\$\-_.!~*'
(),])|(?:%[a-fA-F\d]{2})|[:#&=+])*)(?:/(?:(?:(?:[a-zA-Z\d\$\-_.!~*'(),
])|(?:%[a-fA-F\d]{2})|[:#&=+])*))*)?)))?)|(?:/(?:(?:(?:(?:(?:[a-zA-Z\d
\$\-_.!~*'(),])|(?:%[a-fA-F\d]{2})|[:#&=+])*)(?:/(?:(?:(?:[a-zA-Z\d\$\
-_.!~*'(),])|(?:%[a-fA-F\d]{2})|[:#&=+])*))*)?))|(?:(?:(?:(?:(?:[a-zA-
Z\d\$\-_.!~*'(),])|(?:%[a-fA-F\d]{2})|[:#&=+])*)(?:/(?:(?:(?:[a-zA-Z\d
\$\-_.!~*'(),])|(?:%[a-fA-F\d]{2})|[:#&=+])*))*)?)))
Obviously this is as insane as I feared it would be, so I'll be rethinking the whole thing.
So, the answer is - yes, it can be done, but you should REALLY think twice whether you want to do it this way. Or accept that the regex will be imperfect.

Regex's are safe for lexical validity, but it doesn't mean the site is going to be there. You will actually have to test the connection to see if it's a valid URL by checking the response returned. In all, it depends on your user requirements to say what is valid and what is not - how safe/secure it is depends on you. If you have something like http://foo.com/?referral=http://bar.com/, it'll break some scripts because users don't expect another protocol/path combo as a parameter. Also, some other special characters and null byte hacks have been known to do fishy things with parameters, but I don't think there have been any successful Regex hacks - perhaps memory overflows?
If this is something that needs to be secure, it should probably be handled server side. Although you can perform server side Javascript, I would probably recommend Perl, since it was designed/developed as a text-based parser.
The Regex is only as advanced, or as limited, as you make it. Humans are logical, therefore the problems they solve can be solved by a logic engine (computers), provided that accurate instructions (in this case the regular expression pattern) are given to follow.
#Kerry:
In Javascript you don't "have to" put '/'s around the regular expression. There are also conditions where you can put it in quotes: var re = new Regexp("\w+");
Web Examples:
I think Kerry's was pretty nice, though I only gave it a glance w/o checking it, but here are some simpler examples found from the web
http://www.javascriptkit.com/script/script2/acheck.shtml:
// Email Check
var filter=/^([\w-]+(?:\.[\w-]+)*)#((?:[\w-]+\.)*\w[\w-]{0,66})\.([a-z]{2,6}(?:\.[a-z]{2})?)$/i
if (filter.test("email address or variable here")){...}
http://snippets.dzone.com/posts/show/452:
// URL Validation
function isUrl(s) {
var regexp = /(ftp|http|https):\/\/(\w+:{0,1}\w*#)?(\S+)(:[0-9]+)?(\/|\/([\w#!:.?+=&%#!\-\/]))?/
return regexp.test(s);
}
Finally, this looks promising.
http://www.weberdev.com/get_example-4569.html:
// URL Validation
function isValidURL(url){
var RegExp = /^(([\w]+:)?\/\/)?(([\d\w]|%[a-fA-f\d]{2,2})+(:([\d\w]|%[a-fA-f\d]{2,2})+)?#)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,4}(:[\d]+)?(\/([-+_~.\d\w]|%[a-fA-f\d]{2,2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-f\d]{2,2})=?)*)?(#([-+_~.\d\w]|%[a-fA-f\d]{2,2})*)?$/;
if(RegExp.test(url)){
return true;
}else{
return false;
}
}
// Email Validation
function isValidEmail(email){
var RegExp = /^((([a-z]|[0-9]|!|#|$|%|&|'|\*|\+|\-|\/|=|\?|\^|_|`|\{|\||\}|~)+(\.([a-z]|[0-9]|!|#|$|%|&|'|\*|\+|\-|\/|=|\?|\^|_|`|\{|\||\}|~)+)*)#((((([a-z]|[0-9])([a-z]|[0-9]|\-){0,61}([a-z]|[0-9])\.))*([a-z]|[0-9])([a-z]|[0-9]|\-){0,61}([a-z]|[0-9])\.)[\w]{2,4}|(((([0-9]){1,3}\.){3}([0-9]){1,3}))|(\[((([0-9]){1,3}\.){3}([0-9]){1,3})\])))$/
if(RegExp.test(email)){
return true;
}else{
return false;
}
}

i think that its safe
here is one sample
http://snippets.dzone.com/posts/show/452

I do believe it is safe, and on #1, that isn't a steadfast rule but more a guideline, speaking from personal experience.
This is the one I use to validate a URL:
(([\w]+:)?\/\/)(([\d\w]|%[a-fA-f\d]{2,2})+(:([\d\w]|%[a-fA-f\d]{2,2})+)?#)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,4}(:[\d]+)?(\/([-+_~.\d\w]|%[a-fA-f\d]{2,2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-f\d]{2,2})=?)*)?(#([-+_~.\d\w]|%[a-fA-f\d]{2,2})*)?
Realize that in javascript you have to put '/'s around the regular expression.

Related

How to cut off function from global scope

I have an idea for a game where people can type in some simple instructions for their character like player.goLeft() or player.attackInFront() and for that I have people type their code into a text box and then I parse it into eval(). This works well but it also allows people to change their own character object by typing things like player.health = Infinity; or something similar. I have a list of functions I want to allow people to use, but I am unsure how to restrict it to only use them.
I understand that the whole point of not letting people use eval is to avoid accidental cross-site scripting but I am unsure on how else to do this. If you have a suggestion please leave a comment about that.
I asked some people around on what to do and most suggested somehow changing scope(which is something I was not able to figure out) or to add some odd parameter to each function in my code that would be required to be a specific string to execute any function, but that seems hacky and since I am making the game in browser with p5js it would be easy to just inspect element and see what the password is.
basically every character has variable called "instruction" which is just a string of javascript. Then every frame of the game I execute it by doing eval(playerList[i].instruction);
tl;dr, how can I only allow specific function to be executed and not allow any others?
EDIT: I forgot to mention that I also am planning to provide player with information so that people can made code that would adapt to the situation. For example there will be parameter called vision that has vision.front and vision.left etc. These variables would just say if there is an enemy, wall, flower, etc around them in a grid. Some people suggested that I just replace some functions with key words but then it compromises the idea of using if statements and making it act differently.
EDIT 2: Sorry for lack of code in this post, but because of the way I am making it, half of the logic is written on server side and half of it works on client side. It will be a little large and to be completely honest I am not sure how readable my code is, still so far I am getting great help and I am very thankful for it. Thank you to everybody who is answering
Do NOT use eval() to execute arbitrary user input as code! There's no way to allow your code to run a function but prevent eval() from doing the same.
Instead, what you should do is make a map of commands the player can use, mapping them to functions. That way, you run the function based on the map lookup, but if it's not in the map, it can't be run. You can even allow arguments by splitting the string at spaces and spreading the array over the function parameters. Something like this:
const instructions = {
goLeft: player.goLeft.bind(player),
goRight: player.goRight.bind(player),
attackInFront: player.attackInFront.bind(player)
};
function processInstruction(instruction_string) {
const pieces = instruction_string.split(' ');
const command = pieces[0];
const args = pieces.slice(1);
if (instructions[command]) {
instructions[command](...args);
} else {
// Notify the user their command is not recognized.
}
};
With that, the player can enter things like goLeft 5 6 and it will call player.goLeft(5,6), but if they try to enter otherFunction 20 40 it will just say it's unrecognized, since otherFunction isn't in the map.
This issue sounds similar to the SQL Injection problem. I suggest you use a similar solution. Create an abstraction layer between the users input and your execution, similar to using parameters with stored procedures.
Let the users type keywords such as 'ATTACK FRONT', then pass that input to a function which parses the string, looks for keywords, then passes back 'player.attackInFront()' to be evaluated.
With this approach you simplify the syntax for the users, and limit the possible actions to those you allow.
I hope this isn't too vague. Good luck!
From your edit, it sounds like you're looking for an object-oriented approach to players. I'm not sure of your existing implementation needs, but it would look like this.
function Player() {
this.vision = {
left: '',
// and so on
}
}
Player.prototype.updateVisibilities = function() {
// to modify the values of this.visibility for each player
}
Player.prototype.moveLeft = function() {
}
Don't give the user an arbitrary interface (such as an input textfield that uses eval) to modify their attributes. Make a UI layer to control this logic. Things like buttons, inputs which explicitly run functions/methods that operate on the player. It shouldn't be up to the player as to what attributes they should have.

String logic in everyday chat using javascript

I'm sort of building an AI for a Telegram Bot, and currently I'm trying to process the text and respond to the user almost like a human does.
For example;
"I want to register"
As a human we understand that the user wants to register.
So I'd process this text using javascript's indexOf to look for want and register
var user_text = message.text;
if (user_text.indexOf('want') >= 0) {
if (user_text.indexOf('register') >= 0) {
console.log('He wants to register?')
}
}
But what if the text contains not somewhere in the string? Of course I'd have like a zillion of conditions for a zillion of cases. It'd be tiring to write this kind of logic.
My question is — Is there any other elegant way to do this? I don't really know the keyword to Google this...
The concept you're looking for is natural language processing and is a very broad field. Full NLP is very intricate and complicated, with all kinds of issues.
I would suggest starting with a much simpler solution, by splitting your input into words. You can do that using the String.prototype.split method with some tweaks. Filter out tokens you don't care about and don't contribute to the command, like "the", "a", "an". Take the remaining tokens, look for negation ("not", "don't") and keywords. You may need to combine adjacent tokens, if you have some two-word commands.
That could look something like:
var user_text = message.text;
var tokens = user_text.split(' '); // split on spaces, very simple "word boundary"
tokens = tokens.map(function (token) {
return token.toLowerCase();
});
var remove = ['the', 'a', 'an'];
tokens = tokens.filter(function (token) {
return remove.indexOf(token) === -1; // if remove array does *not* contain token
});
if (tokens.indexOf('register') !== -1) {
// User wants to register
} else if (tokens.indexOf('enable') !== -1) {
if (tokens.indexOf('not') !== -1) {
// User does not want to enable
} else {
// User does want to enable
}
}
This is not a full solution: you will eventually want to run the string through a real tokenizer and potentially even a full parser, and may want to employ a rule engine to simplify the logic.
If you can restrict the inputs you need to understand (a limited number of sentence forms and nouns/verbs), you can probably just use a simple parser with a few rules to handle most commands. Enforcing a predictable sentence structure with articles removed will make your life much easier.
You could also take the example above and replace the filter with a whitelist (only include words that are known). That would leave you with a small set of known tokens, but introduces the potential to strip useful words and misinterpret the command, so you should confirm with the user before running anything.
If you really want to parse and understand sentences expressed in natural language, you should look into the topic of natural language processing. This is usually done with some kind of neural network trained to "understand" different variations of sentences (aka machine learning), because specifying all of different syntactic and semantic rules of the language appears to be an overwhelming task.
If however the amount of variations of these sentences is limited, then you could specify some rules in the form of commonly used word combinations, probably even regular expressions would do in the simplest case.

What is dangerous in an HTML textarea?

I'm working on the profile section of my users.
They can define several things including their description ("About me") in a textarea, with a max of 400 characters.
In this description, I want to let my users use Font Awesome and Bootstrap icons. I also let them use JS tags (but not PHP ones). I guess this is pretty dangerous, therefore I wanted to know :
Is letting people use JS tags dangerous ? I know I must block functions like $.ajax but maybe there are somethings else.
Does a function which blocks string containing JS or PHP code exist in JS or jQuery ?
Is letting people use HTML tags and attributes dangerous for my site ?
Thank you !
As long as you escape all the tags before saving the form, I think it's all good.
You can do this with the following function:
function escapeTags(value){
return $('<div/>').text(value).html();
}
For eg. the following <script>alert("hello world")</script> will become <script>alert("hello world")</script>.
Also, you can do this with javascript only:
function htmlEntities(str) {
return String(str).replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>').replace(/"/g, '"');
}
Source: https://css-tricks.com/snippets/javascript/htmlentities-for-javascript/
...if you take a look at comments you'll se that there's also a function that reverse my escapeTags function
// Encode/decode htmlentities
function krEncodeEntities(s){
return $j("<div/>").text(s).html();
}
function krDencodeEntities(s){
return $j("<div/>").html(s).text();
}
Yes, it's in fact very dangerous, and should only be allowed on a very limited set of "tags" or function calls.
Several systems like bbcode exist to address specifically these issues. i suggest implementing one of those.
They're easier to validate, and it is fairly easy to add new features to them.
It should be less work than validating actual js and php code and figuring out whether or not it is trying to do something malicious.

Strip GUID from field if present

MY client webservice returning some field with GUID and some not.
I want to write a function which will check if there is GUID in that field i need to strip off that GUID from that, If GUID not present then return field as it is.
Prototype of function:-
function stripGuid(field)
{
//check if field has GUID then strip off, return new strip field
//else return field as it is
}
Sample data:-
4922093F-148F-4220-B321-0FBB1843B5DDrec_guid
4922093F-148F-4220-B321-0FBB1843B5DDdate_add
tablenam
sessguid
How I call function:-
stripGuid(sampleData);
Expected Output:-
rec_guid
date_add
tablenam
sessguid
As with most things regex, the complexity of the answer depends on how many cases you need to cover, for example:
return field.replace(/[0-9a-fA-F\-]{36}/g, "");
will cover most cases and isn't too terrible to read, but it fails in some pretty important cases, so it might be that
return field.replace(/[0-9a-fA-F]{8}\-?[0-9a-fA-F]{4}\-?[0-9a-fA-F]{4}\-?[0-9a-fA-F]{4}\-?[0-9a-fA-F]{12}/g, "");
would work better for you (as it ensures that the dashes are all in the right place, and that there are the right number of them). The better option depends on how standardized you expect the input to be and where your project draws the line between readability and correctness. Without more details it's hard to say what would be best for you.
Edit: Nate Kerkhofs is right, I had left off the global flag on the above regexes, it's fixed now.
return field.replace(/[a-f0-9]{8}-(?:[a-f0-9]{4}-){3}[a-f0-9]{12}/gi, "");
Unlike teryret's solution, this removes all guids, and also ignores case for easier reading.

Asymmetrical search for special characters in JavaScript

I'm trying to implement an asymmetrical search for a dictionary web app, so searching for ü, for example, will return only tokens that actually contain ü, but searching for u will return both u and ü. (This is so users who don't know how to type special characters can still search for them, but users who do know how to type them won't be inundated with the plain character forms unnecessarily.)
It has to all be client-side JavaScript without any external libraries.
I've managed to make the second search type work by running both the search term and the text I'm searching through the following function, effectively merging special characters with their plain counterparts:
function cleanUp(dirty) {
cleaned = dirty.replace(/[áàâãäāă]/ig,"a");
cleaned = cleaned.replace(/đ/ig,"d");
cleaned = cleaned.replace(/[éèêẽëēĕ]/ig,"e");
cleaned = cleaned.replace(/[íìîĩïīĭ]/ig,"i");
cleaned = cleaned.replace(/ñ/ig,"n");
cleaned = cleaned.replace(/[óòôõöōŏ]/ig,"o");
cleaned = cleaned.replace(/[úùûũüūŭ]/ig,"u");
return cleaned;
}
I then compare the strings to get my results with something like:
var search_term = cleanup(search_input.value);
var text_to_search = cleanup(main_text);
if (text_to_search.indexOf(search_term) > -1) ... //do something
It's not elegant, but it works. After cleaning up both strings the user can search for i.e. uber and get über even if they don't know how to type ü. But if they do know how, searching for über directly also returns things like uber, which is what I don't want.
I've already thought of things like checking for each special character separately for each search term or duplicating every dictionary entry that has a special character to produce a special-character and a plain-character version, but all of my ideas would seriously slow down the processing time for the search.
Any ideas are greatly appreciated.
The answer you posted sounds quite reasonable.
I would just like to suggest a cleaner way (pun intended) to code your cleanup() function and similar functions that do a series of string operations:
function cleanUp(dirty) {
return dirty
.replace(/[áàâãäāă]/ig,"a")
.replace(/đ/ig,"d")
.replace(/[éèêẽëēĕ]/ig,"e")
.replace(/[íìîĩïīĭ]/ig,"i")
.replace(/ñ/ig,"n")
.replace(/[óòôõöōŏ]/ig,"o")
.replace(/[úùûũüūŭ]/ig,"u");
}
I ended up checking to see if the search term contained any special characters, and if it did, I didn't run it through cleanup(), and compared it to the original dictionary entry instead of the cleaned one. Thanks for the comments everyone.

Categories