extract URI from given text (facebook like URL detection) [duplicate] - javascript
Does anyone have suggestions for detecting URLs in a set of strings?
arrayOfStrings.forEach(function(string){
// detect URLs in strings and do something swell,
// like creating elements with links.
});
Update: I wound up using this regex for link detection… Apparently several years later.
// Link detector: optional scheme, dotted host ending in a two-letter or
// well-known TLD, optional port, path, query string and fragment. The match
// must be followed by whitespace or end of input, and that trailing
// whitespace is part of the match.
// Declared with `const` so the assignment does not create an implicit global
// (which throws in strict mode / ES modules).
// In the fragment class the hyphen is escaped so `=`, `-` and `_` are three
// literals rather than the range `=`..`_`, and `@` is allowed as in an
// RFC 3986 fragment.
// NOTE: the regex carries the `g` flag, so `.test()` / `.exec()` are stateful
// via `lastIndex`; prefer `String.prototype.match` / `replace`.
const kLINK_DETECTION_REGEX = /(([a-z]+:\/\/)?(([a-z0-9\-]+\.)+([a-z]{2}|aero|arpa|biz|com|coop|edu|gov|info|int|jobs|mil|museum|name|nato|net|org|pro|travel|local|internal))(:[0-9]{1,5})?(\/[a-z0-9_\-\.~]+)*(\/([a-z0-9_\-\.]*)(\?[a-z0-9+_\-\.%=&]*)?)?(#[a-zA-Z0-9!$&'()*+.=\-_~:@/?]*)?)(\s+|$)/gi
The full helper (with optional Handlebars support) is at gist #1654670.
First you need a good regex that matches urls. This is hard to do. See here, here and here:
...almost anything is a valid URL. There
are some punctuation rules for
splitting it up. Absent any
punctuation, you still have a valid
URL.
Check the RFC carefully and see if you
can construct an "invalid" URL. The
rules are very flexible.
For example ::::: is a valid URL.
The path is ":::::". A pretty
stupid filename, but a valid filename.
Also, ///// is a valid URL. The
netloc ("hostname") is "". The path
is "///". Again, stupid. Also
valid. This URL normalizes to "///"
which is the equivalent.
Something like "bad://///worse/////"
is perfectly valid. Dumb but valid.
Anyway, this answer is not meant to give you the best regex but rather a proof of how to do the string wrapping inside the text, with JavaScript.
OK so lets just use this one: /(https?:\/\/[^\s]+)/g
Again, this is a bad regex. It will have many false positives. However it's good enough for this example.
/**
 * Wraps every http/https URL found in `text` in an HTML anchor.
 *
 * The pattern is deliberately naive: a URL is "http://" or "https://"
 * followed by any run of non-whitespace characters, so trailing punctuation
 * is swallowed into the link. The matched text is inserted into the markup
 * as-is (no HTML escaping).
 *
 * @param {string} text - Text that may contain URLs.
 * @returns {string} The text with each URL replaced by `<a href="URL">URL</a>`.
 */
function urlify(text) {
  const urlRegex = /(https?:\/\/[^\s]+)/g;
  return text.replace(urlRegex, function(url) {
    return '<a href="' + url + '">' + url + '</a>';
  });
  // or alternatively
  // return text.replace(urlRegex, '<a href="$1">$1</a>')
}

const text = 'Find me at http://www.example.com and also at http://stackoverflow.com';
const html = urlify(text);

console.log(html);

// html now looks like:
// "Find me at <a href="http://www.example.com">http://www.example.com</a> and also at <a href="http://stackoverflow.com">http://stackoverflow.com</a>"
So in sum try:
$$('#pad dl dd').each(function(element) {
element.innerHTML = urlify(element.innerHTML);
});
Here is what I ended up using as my regex:
// Matches http, https, ftp and file URLs. The final character class omits
// `?!:,.;`, so sentence punctuation directly after a URL is not captured.
// `@` is part of both classes so URLs containing it are not cut short.
const urlRegex = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
This doesn't include trailing punctuation in the URL. Crescent's function works like a charm :)
so:
/**
 * Replaces each http/https/ftp/file URL in `text` with an HTML anchor.
 *
 * The pattern's final character class omits `?!:,.;`, so punctuation that
 * directly follows a URL stays outside the link. `@` is accepted inside URLs.
 *
 * @param {string} text - Text that may contain URLs.
 * @returns {string} The text with each URL replaced by `<a href="URL">URL</a>`.
 */
function linkify(text) {
  const urlRegex = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
  return text.replace(urlRegex, (url) => `<a href="${url}">${url}</a>`);
}
I googled this problem for quite a while, then it occurred to me that there is an Android method, android.text.util.Linkify, that utilizes some pretty robust regexes to accomplish this. Luckily, Android is open source.
They use a few different patterns for matching different types of urls. You can find them all here:
http://grepcode.com/file/repository.grepcode.com/java/ext/com.google.android/android/2.0_r1/android/text/util/Regex.java#Regex.0WEB_URL_PATTERN
If you're just concerned about url's that match the WEB_URL_PATTERN, that is, urls that conform to the RFC 1738 spec, you can use this:
/((?:(http|https|Http|Https|rtsp|Rtsp):\/\/(?:(?:[a-zA-Z0-9\$\-\_\.\+\!\*\'\(\)\,\;\?\&\=]|(?:\%[a-fA-F0-9]{2})){1,64}(?:\:(?:[a-zA-Z0-9\$\-\_\.\+\!\*\'\(\)\,\;\?\&\=]|(?:\%[a-fA-F0-9]{2})){1,25})?\@)?)?((?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]{0,64}\.)+(?:(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])|(?:biz|b[abdefghijmnorstvwyz])|(?:cat|com|coop|c[acdfghiklmnoruvxyz])|d[ejkmoz]|(?:edu|e[cegrstu])|f[ijkmor]|(?:gov|g[abdefghilmnpqrstuwy])|h[kmnrtu]|(?:info|int|i[delmnoqrst])|(?:jobs|j[emop])|k[eghimnrwyz]|l[abcikrstuvy]|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])|(?:name|net|n[acefgilopruz])|(?:org|om)|(?:pro|p[aefghklmnrstwy])|qa|r[eouw]|s[abcdeghijklmnortuvyz]|(?:tel|travel|t[cdfghjklmnoprtvwz])|u[agkmsyz]|v[aceginu]|w[fs]|y[etu]|z[amw]))|(?:(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9])))(?:\:\d{1,5})?)(\/(?:(?:[a-zA-Z0-9\;\/\?\:\@\&\=\#\~\-\.\+\!\*\'\(\)\,\_])|(?:\%[a-fA-F0-9]{2}))*)?(?:\b|$)/gi;
Here is the full text of the source:
"((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+ "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+ "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+ "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
+ "(?:" // plus top level domain
+ "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
+ "|(?:biz|b[abdefghijmnorstvwyz])"
+ "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
+ "|d[ejkmoz]"
+ "|(?:edu|e[cegrstu])"
+ "|f[ijkmor]"
+ "|(?:gov|g[abdefghilmnpqrstuwy])"
+ "|h[kmnrtu]"
+ "|(?:info|int|i[delmnoqrst])"
+ "|(?:jobs|j[emop])"
+ "|k[eghimnrwyz]"
+ "|l[abcikrstuvy]"
+ "|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])"
+ "|(?:name|net|n[acefgilopruz])"
+ "|(?:org|om)"
+ "|(?:pro|p[aefghklmnrstwy])"
+ "|qa"
+ "|r[eouw]"
+ "|s[abcdeghijklmnortuvyz]"
+ "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
+ "|u[agkmsyz]"
+ "|v[aceginu]"
+ "|w[fs]"
+ "|y[etu]"
+ "|z[amw]))"
+ "|(?:(?:25[0-5]|2[0-4]" // or ip address
+ "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+ "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+ "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
+ "|[1-9][0-9]|[0-9])))"
+ "(?:\\:\\d{1,5})?)" // plus option port number
+ "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
+ "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
+ "(?:\\b|$)";
If you want to be really fancy, you can test for email addresses as well. The regex for email addresses is:
/[a-zA-Z0-9\+\.\_\%\-]{1,256}\@[a-zA-Z0-9][a-zA-Z0-9\-]{0,64}(\.[a-zA-Z0-9][a-zA-Z0-9\-]{0,25})+/gi
PS: The top level domains supported by above regex are current as of June 2007. For an up to date list you'll need to check https://data.iana.org/TLD/tlds-alpha-by-domain.txt.
Based on Crescent Fresh answer
if you want to detect links with http:// OR without http:// and by www. you can use the following
/**
 * Wraps URLs in `text` in HTML anchors. A URL is anything that starts with
 * "http://", "https://" or "www." and runs up to the next whitespace.
 *
 * For "www." matches the href gets an "http://" scheme prepended so the link
 * is absolute; the visible link text is always the text exactly as written.
 *
 * @param {string} text - Text that may contain URLs.
 * @returns {string} The text with each URL replaced by `<a href="HREF">URL</a>`.
 */
function urlify(text) {
  const urlRegex = /(((https?:\/\/)|(www\.))[^\s]+)/g;
  // Replacer arguments: full match, capture 1 (same as the full match),
  // capture 2 (the prefix that matched: "http://", "https://" or "www.").
  return text.replace(urlRegex, (url, _wholeMatch, prefix) => {
    const href = prefix === 'www.' ? 'http://' + url : url;
    return '<a href="' + href + '">' + url + '</a>';
  });
}
This library on NPM looks like it is pretty comprehensive https://www.npmjs.com/package/linkifyjs
Linkify is a small yet comprehensive JavaScript plugin for finding URLs in plain-text and converting them to HTML links. It works with all valid URLs and email addresses.
Function can be further improved to render images as well:
/**
 * Converts text into HTML in which every URL becomes either an inline image
 * or a link, each followed by a line break.
 *
 * The input is first passed through `strip()` (defined elsewhere on this
 * page). A URL is rendered as an `<img>` when ".jpg", ".png" or ".gif" occurs
 * anywhere in it after the first character; otherwise it is rendered as an
 * `<a>` whose text is the URL itself.
 *
 * @param {string} text - Raw text or HTML to render.
 * @returns {string} The stripped text with URLs replaced by markup.
 */
function renderHTML(text) {
  const rawText = strip(text);
  const urlRegex = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
  return rawText.replace(urlRegex, (url) => {
    const looksLikeImage =
      url.indexOf('.jpg') > 0 || url.indexOf('.png') > 0 || url.indexOf('.gif') > 0;
    if (looksLikeImage) {
      return '<img src="' + url + '">' + '<br/>';
    }
    return '<a href="' + url + '">' + url + '</a>' + '<br/>';
  });
}
or for a thumbnail image that links to the full-size image:
return '<img style="width: 100px; border: 0px; -moz-border-radius: 5px; border-radius: 5px;" src="' + url + '">' + '' + '<br/>'
And here is the strip() function that pre-processes the text string for uniformity by removing any existing html.
/**
 * Reduces an HTML string to its text and puts every URL on its own line.
 *
 * The markup is parsed by assigning it to a detached DIV and reading back
 * `innerText`; each URL in that text is then prefixed with a newline.
 *
 * NOTE(review): assigning untrusted markup to `innerHTML` parses it in the
 * live document context — confirm the input is trusted or sanitise it first.
 *
 * @param {string} html - HTML (or plain text) to strip.
 * @returns {string} The text content with "\n" inserted before each URL.
 */
function strip(html)
{
  const tmp = document.createElement("DIV");
  tmp.innerHTML = html;
  const urlRegex = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
  return tmp.innerText.replace(urlRegex, (url) => '\n' + url);
}
There is existing npm package: url-regex, just install it with yarn add url-regex or npm install url-regex and use as following:
const urlRegex = require('url-regex');

// With `strict: false` the package also matches bare domains such as
// "google.com", so a scheme is prepended to the href when the match has none;
// otherwise the browser would resolve it as a relative link. The visible link
// text is left exactly as written.
const replaced = 'Find me at http://www.example.com and also at http://stackoverflow.com or at google.com'
  .replace(urlRegex({strict: false}), function(url) {
    const href = /^[a-z][a-z0-9+.-]*:\/\//i.test(url) ? url : 'http://' + url;
    return '<a href="' + href + '">' + url + '</a>';
  });
let str = 'https://example.com is a great site';
// Strings are immutable: replace() returns a new string, so the result has to
// be assigned back or the linkified text is lost.
str = str.replace(/(https?:\/\/[^\s]+)/g, "<a href='$1' target='_blank' >$1</a>");
Short Code Big Work!...
Result:-
<a href="https://example.com" target="_blank" > https://example.com </a>
If you want to detect links with http:// OR without http:// OR ftp OR other possible cases like removing trailing punctuation at the end, take a look at this code.
https://jsfiddle.net/AndrewKang/xtfjn8g3/
A simple way to use that is to use NPM
npm install --save url-knife
Detect URLs in text and make clickable.
/**
 * Turns every http/https URL inside the first element matching a selector
 * into a clickable link, by rewriting that element's `innerHTML`.
 *
 * A URL ends at the first whitespace character or `<`, so a closing tag that
 * directly follows a URL is not swallowed into the link.
 *
 * NOTE(review): the replacement runs over the serialized HTML, so URLs that
 * already sit inside attributes (for example an existing `href`) are rewritten
 * as well — use only on containers whose URLs appear as plain text.
 *
 * @param {string} contentElement - CSS selector of the container element.
 * @returns {Element|null} The updated element, or null when nothing matches
 *   the selector.
 */
const detectURLInText = ( contentElement ) => {
  const elem = document.querySelector(contentElement);
  if (!elem) {
    // querySelector yields null when nothing matches; return it instead of
    // throwing on the property access below.
    return null;
  }
  elem.innerHTML = elem.innerHTML.replace(/(https?:\/\/[^\s<]+)/g, `<a class='link' href="$1">$1</a>`);
  return elem;
}
detectURLInText( '#myContent');
<div id="myContent">
Hell world!, detect URLs in text and make clickable.
IP: https://123.0.1.890:8080
Web: https://any-domain.com
</div>
try this:
/**
 * Heuristically decides whether the whole string `s` is a URL.
 *
 * `s` must first match a URL-shaped regex (optional http/https/ftp scheme,
 * optional `user:password@`, an IP address or dotted host name, optional
 * port, optional path/query/fragment). It is then accepted only if it either
 * starts with a known prefix ("http://", "https://", "ftp://", "www.") or
 * uses one of the listed top-level domains at the end of the string or
 * directly before a "/" or "?".
 *
 * Because the scheme is optional and userinfo is allowed, a string such as
 * "user@example.com" is also accepted; use `isEmail` first if the two need
 * to be told apart.
 *
 * The regex and lookup lists are built on first use and cached as properties
 * of the function itself.
 *
 * @param {string} s - Candidate string.
 * @returns {boolean} True when `s` looks like a URL.
 */
function isUrl(s) {
  if (!isUrl.rx_url) {
    // taken from https://gist.github.com/dperini/729294
    isUrl.rx_url = /^(?:(?:https?|ftp):\/\/)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,}))\.?)(?::\d{2,5})?(?:[/?#]\S*)?$/i;
    // valid prefixes
    isUrl.prefixes = ['http://', 'https://', 'ftp://', 'www.'];
    // taken from https://w3techs.com/technologies/overview/top_level_domain/all
    isUrl.domains = ['com','ru','net','org','de','jp','uk','br','pl','in','it','fr','au','info','nl','ir','cn','es','cz','kr','ua','ca','eu','biz','za','gr','co','ro','se','tw','mx','vn','tr','ch','hu','at','be','dk','tv','me','ar','no','us','sk','xyz','fi','id','cl','by','nz','il','ie','pt','kz','io','my','lt','hk','cc','sg','edu','pk','su','bg','th','top','lv','hr','pe','club','rs','ae','az','si','ph','pro','ng','tk','ee','asia','mobi'];
  }

  if (!isUrl.rx_url.test(s)) return false;

  // An explicit scheme or "www." is enough on its own.
  if (isUrl.prefixes.some((prefix) => s.startsWith(prefix))) return true;

  // Otherwise require a listed TLD at the end, or right before a path/query.
  return isUrl.domains.some((tld) => {
    const suffix = '.' + tld;
    return s.endsWith(suffix) || s.includes(suffix + '/') || s.includes(suffix + '?');
  });
}
/**
 * Tests whether the whole string `s` matches an address pattern assembled
 * from RFC 822-style pieces (atoms, quoted strings, domain literals).
 *
 * The compiled regex is cached on the function as `isEmail.rx_email` after
 * the first call.
 *
 * @param {string} s - Candidate string.
 * @returns {boolean} True when `s` matches the address pattern.
 */
function isEmail(s) {
  if (isEmail.rx_email) {
    return isEmail.rx_email.test(s);
  }

  // taken from http://stackoverflow.com/a/16016476/460084
  const qtext = '[^\\x0d\\x22\\x5c\\x80-\\xff]';
  const dtext = '[^\\x0d\\x5b-\\x5d\\x80-\\xff]';
  const atom = '[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+';
  const quotedPair = '\\x5c[\\x00-\\x7f]';

  // "(left|right)" alternation group.
  const either = (left, right) => '(' + left + '|' + right + ')';
  // One unit followed by any number of "." + unit repetitions.
  const dotSeparated = (unit) => unit + '(\\x2e' + unit + ')*';

  const domainLiteral = '\\x5b' + either(dtext, quotedPair) + '*\\x5d';
  const quotedString = '\\x22' + either(qtext, quotedPair) + '*\\x22';
  const subDomain = either(atom, domainLiteral);
  const word = either(atom, quotedString);

  // local-part "@" domain, anchored so the whole string must match.
  const addrSpec = dotSeparated(word) + '\\x40' + dotSeparated(subDomain);
  isEmail.rx_email = new RegExp('^' + addrSpec + '$');

  return isEmail.rx_email.test(s);
}
will also recognize urls such as google.com , http://www.google.bla , http://google.bla , www.google.bla but not google.bla
Generic Object Oriented Solution
For people like me that use frameworks like angular that don't allow manipulating DOM directly, I created a function that takes a string and returns an array of url/plainText objects that can be used to create any UI representation that you want.
URL regex
For URL matching I used (slightly adapted) h0mayun regex: /(?:(?:https?:\/\/)|(?:www\.))[^\s]+/g
My function also drops punctuation characters from the end of a URL like . and , that I believe more often will be actual punctuation than a legit URL ending (but it could be! This is not rigorous science as other answers explain well) For that I apply the following regex onto matched URLs /^(.+?)([.,?!'"]*)$/.
Typescript code
/**
 * Splits a string into an ordered list of plain-text and URL segments.
 *
 * A URL is anything starting with "http://", "https://" or "www." up to the
 * next whitespace. Trailing `.,?!'"` characters are split off the URL and
 * emitted as a separate text segment. Empty segments are never emitted.
 *
 * @param inputString Text that may contain URLs.
 * @returns The segments in source order; an empty array for empty input.
 */
export function urlMatcherInText(inputString: string): UrlMatcherResult[] {
    if (!inputString) return [];

    const results: UrlMatcherResult[] = [];

    // Appends one segment, skipping empty strings.
    function addSegment(type: 'url' | 'text', value: string): void {
        if (!value) return;
        const result = new UrlMatcherResult();
        result.type = type;
        result.value = value;
        results.push(result);
    }

    const findUrlRegex = /(?:(?:https?:\/\/)|(?:www\.))[^\s]+/g;
    // Group 1: the URL proper (lazy); group 2: trailing punctuation, if any.
    const cleanUrlRegex = /^(.+?)([.,?!'"]*)$/;

    // Index of the first character not yet emitted.
    let cursor = 0;
    // exec() returns null once no further match exists, which ends the loop.
    let match: RegExpExecArray | null;
    while ((match = findUrlRegex.exec(inputString)) !== null) {
        addSegment('text', inputString.slice(cursor, match.index));

        const dirtyUrl = match[0];
        const cleaned = cleanUrlRegex.exec(dirtyUrl);
        if (cleaned) {
            addSegment('url', cleaned[1]);
            addSegment('text', cleaned[2]);
        } else {
            // Fallback that keeps the text if the clean-up pattern ever fails to match.
            addSegment('url', dirtyUrl);
        }

        cursor = match.index + dirtyUrl.length;
    }

    addSegment('text', inputString.slice(cursor));

    return results;
}

/** One segment produced by `urlMatcherInText`. */
export class UrlMatcherResult {
    /** Whether `value` is a URL or ordinary text. */
    public type!: 'url' | 'text'
    /** The segment's text exactly as it appeared in the input. */
    public value!: string
}
Here is a small solution for a React app that does not use any library. Please note that this method only works when the URL is not attached to any other character (that is, it is separated from the surrounding text by spaces).
This component returns a paragraph with link detection!
import React from "react";
interface Props {
  paragraph: string,
}

// Matches a whole word that looks like a URL: optional http(s):// and www.,
// a lower-case host, a 2-5 letter TLD, optional port and optional path.
const REGEX = /^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$/gm;

/**
 * Renders `paragraph` inside a <div>, turning every space-separated word
 * that matches REGEX into a link. Words are split on single spaces only, so
 * a URL glued to other characters is not detected.
 */
const Paragraph: React.FC<Props> = ({ paragraph }) => {
  const paragraphArray = paragraph.split(' ');
  return <div>
    {
      paragraphArray.map((word: string, index: number) => {
        if (!word.match(REGEX)) {
          return word + ' ';
        }
        // REGEX also accepts bare domains; give those a scheme so the
        // browser does not treat the href as a relative path.
        const href = /^https?:\/\//.test(word) ? word : `http://${word}`;
        return (
          <React.Fragment key={index}>
            <a href={href}>{word}</a>{' '}
          </React.Fragment>
        );
      })
    }
  </div>;
};

export default Paragraph;
tmp.innerText is undefined. You should use tmp.innerHTML
/**
 * Normalises an HTML string through a detached DIV and puts every URL on its
 * own line.
 *
 * This variant reads back `innerHTML` rather than `innerText`, so the markup
 * itself is kept in the result; only the newline-before-URL step is applied.
 *
 * @param {string} html - HTML (or plain text) to process.
 * @returns {string} The serialized HTML with "\n" inserted before each URL.
 */
function strip(html)
{
  var tmp = document.createElement("DIV");
  tmp.innerHTML = html;
  var urlRegex = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
  return tmp.innerHTML.replace(urlRegex, function(url) {
    return '\n' + url
  })
}
You can use a regex like this to extract normal url patterns.
(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})
If you need more sophisticated patterns, use a library like this.
https://www.npmjs.com/package/pattern-dreamer
Related
javascript retrieve all links from text [duplicate]
Does anyone have suggestions for detecting URLs in a set of strings? arrayOfStrings.forEach(function(string){ // detect URLs in strings and do something swell, // like creating elements with links. }); Update: I wound up using this regex for link detection… Apparently several years later. kLINK_DETECTION_REGEX = /(([a-z]+:\/\/)?(([a-z0-9\-]+\.)+([a-z]{2}|aero|arpa|biz|com|coop|edu|gov|info|int|jobs|mil|museum|name|nato|net|org|pro|travel|local|internal))(:[0-9]{1,5})?(\/[a-z0-9_\-\.~]+)*(\/([a-z0-9_\-\.]*)(\?[a-z0-9+_\-\.%=&]*)?)?(#[a-zA-Z0-9!$&'()*+.=-_~:#/?]*)?)(\s+|$)/gi The full helper (with optional Handlebars support) is at gist #1654670.
First you need a good regex that matches urls. This is hard to do. See here, here and here: ...almost anything is a valid URL. There are some punctuation rules for splitting it up. Absent any punctuation, you still have a valid URL. Check the RFC carefully and see if you can construct an "invalid" URL. The rules are very flexible. For example ::::: is a valid URL. The path is ":::::". A pretty stupid filename, but a valid filename. Also, ///// is a valid URL. The netloc ("hostname") is "". The path is "///". Again, stupid. Also valid. This URL normalizes to "///" which is the equivalent. Something like "bad://///worse/////" is perfectly valid. Dumb but valid. Anyway, this answer is not meant to give you the best regex but rather a proof of how to do the string wrapping inside the text, with JavaScript. OK so lets just use this one: /(https?:\/\/[^\s]+)/g Again, this is a bad regex. It will have many false positives. However it's good enough for this example. function urlify(text) { var urlRegex = /(https?:\/\/[^\s]+)/g; return text.replace(urlRegex, function(url) { return '' + url + ''; }) // or alternatively // return text.replace(urlRegex, '$1') } var text = 'Find me at http://www.example.com and also at http://stackoverflow.com'; var html = urlify(text); console.log(html) // html now looks like: // "Find me at http://www.example.com and also at http://stackoverflow.com" So in sum try: $$('#pad dl dd').each(function(element) { element.innerHTML = urlify(element.innerHTML); });
Here is what I ended up using as my regex: var urlRegex =/(\b(https?|ftp|file):\/\/[-A-Z0-9+&##\/%?=~_|!:,.;]*[-A-Z0-9+&##\/%=~_|])/ig; This doesn't include trailing punctuation in the URL. Crescent's function works like a charm :) so: function linkify(text) { var urlRegex =/(\b(https?|ftp|file):\/\/[-A-Z0-9+&##\/%?=~_|!:,.;]*[-A-Z0-9+&##\/%=~_|])/ig; return text.replace(urlRegex, function(url) { return '' + url + ''; }); }
I googled this problem for quite a while, then it occurred to me that there is an Android method, android.text.util.Linkify, that utilizes some pretty robust regexes to accomplish this. Luckily, Android is open source. They use a few different patterns for matching different types of urls. You can find them all here: http://grepcode.com/file/repository.grepcode.com/java/ext/com.google.android/android/2.0_r1/android/text/util/Regex.java#Regex.0WEB_URL_PATTERN If you're just concerned about url's that match the WEB_URL_PATTERN, that is, urls that conform to the RFC 1738 spec, you can use this: /((?:(http|https|Http|Https|rtsp|Rtsp):\/\/(?:(?:[a-zA-Z0-9\$\-\_\.\+\!\*\'\(\)\,\;\?\&\=]|(?:\%[a-fA-F0-9]{2})){1,64}(?:\:(?:[a-zA-Z0-9\$\-\_\.\+\!\*\'\(\)\,\;\?\&\=]|(?:\%[a-fA-F0-9]{2})){1,25})?\#)?)?((?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]{0,64}\.)+(?:(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])|(?:biz|b[abdefghijmnorstvwyz])|(?:cat|com|coop|c[acdfghiklmnoruvxyz])|d[ejkmoz]|(?:edu|e[cegrstu])|f[ijkmor]|(?:gov|g[abdefghilmnpqrstuwy])|h[kmnrtu]|(?:info|int|i[delmnoqrst])|(?:jobs|j[emop])|k[eghimnrwyz]|l[abcikrstuvy]|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])|(?:name|net|n[acefgilopruz])|(?:org|om)|(?:pro|p[aefghklmnrstwy])|qa|r[eouw]|s[abcdeghijklmnortuvyz]|(?:tel|travel|t[cdfghjklmnoprtvwz])|u[agkmsyz]|v[aceginu]|w[fs]|y[etu]|z[amw]))|(?:(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9])))(?:\:\d{1,5})?)(\/(?:(?:[a-zA-Z0-9\;\/\?\:\#\&\=\#\~\-\.\+\!\*\'\(\)\,\_])|(?:\%[a-fA-F0-9]{2}))*)?(?:\b|$)/gi; Here is the full text of the source: "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\#)?)?" 
+ "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host + "(?:" // plus top level domain + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(?:biz|b[abdefghijmnorstvwyz])" + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" + "|d[ejkmoz]" + "|(?:edu|e[cegrstu])" + "|f[ijkmor]" + "|(?:gov|g[abdefghilmnpqrstuwy])" + "|h[kmnrtu]" + "|(?:info|int|i[delmnoqrst])" + "|(?:jobs|j[emop])" + "|k[eghimnrwyz]" + "|l[abcikrstuvy]" + "|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])" + "|(?:name|net|n[acefgilopruz])" + "|(?:org|om)" + "|(?:pro|p[aefghklmnrstwy])" + "|qa" + "|r[eouw]" + "|s[abcdeghijklmnortuvyz]" + "|(?:tel|travel|t[cdfghjklmnoprtvwz])" + "|u[agkmsyz]" + "|v[aceginu]" + "|w[fs]" + "|y[etu]" + "|z[amw]))" + "|(?:(?:25[0-5]|2[0-4]" // or ip address + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" + "|[1-9][0-9]|[0-9])))" + "(?:\\:\\d{1,5})?)" // plus option port number + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\#\\&\\=\\#\\~" // plus option query params + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" + "(?:\\b|$)"; If you want to be really fancy, you can test for email addresses as well. The regex for email addresses is: /[a-zA-Z0-9\\+\\.\\_\\%\\-]{1,256}\\#[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}(\\.[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25})+/gi PS: The top level domains supported by above regex are current as of June 2007. For an up to date list you'll need to check https://data.iana.org/TLD/tlds-alpha-by-domain.txt.
Based on Crescent Fresh answer if you want to detect links with http:// OR without http:// and by www. you can use the following function urlify(text) { var urlRegex = /(((https?:\/\/)|(www\.))[^\s]+)/g; //var urlRegex = /(https?:\/\/[^\s]+)/g; return text.replace(urlRegex, function(url,b,c) { var url2 = (c == 'www.') ? 'http://' +url : url; return '' + url + ''; }) }
This library on NPM looks like it is pretty comprehensive https://www.npmjs.com/package/linkifyjs Linkify is a small yet comprehensive JavaScript plugin for finding URLs in plain-text and converting them to HTML links. It works with all valid URLs and email addresses.
Function can be further improved to render images as well: function renderHTML(text) { var rawText = strip(text) var urlRegex =/(\b(https?|ftp|file):\/\/[-A-Z0-9+&##\/%?=~_|!:,.;]*[-A-Z0-9+&##\/%=~_|])/ig; return rawText.replace(urlRegex, function(url) { if ( ( url.indexOf(".jpg") > 0 ) || ( url.indexOf(".png") > 0 ) || ( url.indexOf(".gif") > 0 ) ) { return '<img src="' + url + '">' + '<br/>' } else { return '' + url + '' + '<br/>' } }) } or for a thumbnail image that links to fiull size image: return '<img style="width: 100px; border: 0px; -moz-border-radius: 5px; border-radius: 5px;" src="' + url + '">' + '' + '<br/>' And here is the strip() function that pre-processes the text string for uniformity by removing any existing html. function strip(html) { var tmp = document.createElement("DIV"); tmp.innerHTML = html; var urlRegex =/(\b(https?|ftp|file):\/\/[-A-Z0-9+&##\/%?=~_|!:,.;]*[-A-Z0-9+&##\/%=~_|])/ig; return tmp.innerText.replace(urlRegex, function(url) { return '\n' + url }) }
There is existing npm package: url-regex, just install it with yarn add url-regex or npm install url-regex and use as following: const urlRegex = require('url-regex'); const replaced = 'Find me at http://www.example.com and also at http://stackoverflow.com or at google.com' .replace(urlRegex({strict: false}), function(url) { return '' + url + ''; });
let str = 'https://example.com is a great site' str.replace(/(https?:\/\/[^\s]+)/g,"<a href='$1' target='_blank' >$1</a>") Short Code Big Work!... Result:- <a href="https://example.com" target="_blank" > https://example.com </a>
If you want to detect links with http://, without http://, with ftp, or other cases such as stripping trailing punctuation from the end of a link, take a look at this code: https://jsfiddle.net/AndrewKang/xtfjn8g3/ A simple way to use it is to install it from npm: npm install --save url-knife
Detect URLs in text and make clickable. const detectURLInText = ( contentElement ) => { const elem = document.querySelector(contentElement); elem.innerHTML = elem.innerHTML.replace(/(https?:\/\/[^\s]+)/g, `<a class='link' href="$1">$1</a>`) return elem } detectURLInText( '#myContent'); <div id="myContent"> Hell world!, detect URLs in text and make clickable. IP: https://123.0.1.890:8080 Web: https://any-domain.com </div>
// Try this:

/**
 * Heuristically decides whether `s` looks like a web URL.
 *
 * The string must first match a structural URL pattern (scheme optional,
 * private/loopback IPv4 ranges excluded). It is then accepted only if it
 * either starts with one of the known prefixes ("http://", "https://",
 * "ftp://", "www.") or contains one of the listed common top-level domains
 * at the end of the string or directly before a "/" or "?".
 *
 * The regex and lookup tables are built lazily on first call and cached as
 * properties of the function (`isUrl.rx_url`, `isUrl.prefixes`,
 * `isUrl.domains`).
 *
 * @param {string} s - Candidate string.
 * @returns {boolean} `true` if `s` is considered a URL; `false` otherwise
 *   (including when `s` is not a string).
 */
function isUrl(s) {
  if (typeof s !== 'string') return false;

  if (!isUrl.rx_url) {
    // Taken from https://gist.github.com/dperini/729294 with the scheme made
    // optional. The optional userinfo group must end in "@" ("user:pass@").
    isUrl.rx_url = /^(?:(?:https?|ftp):\/\/)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,}))\.?)(?::\d{2,5})?(?:[/?#]\S*)?$/i;
    // Valid prefixes.
    isUrl.prefixes = ['http://', 'https://', 'ftp://', 'www.'];
    // Taken from https://w3techs.com/technologies/overview/top_level_domain/all
    isUrl.domains = ['com','ru','net','org','de','jp','uk','br','pl','in','it','fr','au','info','nl','ir','cn','es','cz','kr','ua','ca','eu','biz','za','gr','co','ro','se','tw','mx','vn','tr','ch','hu','at','be','dk','tv','me','ar','no','us','sk','xyz','fi','id','cl','by','nz','il','ie','pt','kz','io','my','lt','hk','cc','sg','edu','pk','su','bg','th','top','lv','hr','pe','club','rs','ae','az','si','ph','pro','ng','tk','ee','asia','mobi'];
  }

  if (!isUrl.rx_url.test(s)) return false;

  // An explicit scheme or "www." is enough once the structure is valid.
  if (isUrl.prefixes.some((prefix) => s.startsWith(prefix))) return true;

  // Otherwise require a well-known TLD so that e.g. "google.bla" is rejected.
  return isUrl.domains.some((tld) => {
    const suffix = `.${tld}`;
    return s.endsWith(suffix) || s.includes(`${suffix}/`) || s.includes(`${suffix}?`);
  });
}

/**
 * Tests whether the whole string `s` is an RFC 822 style address
 * (`local-part@domain`).
 *
 * The regex is assembled lazily on first call and cached as
 * `isEmail.rx_email`.
 *
 * @param {string} s - Candidate string.
 * @returns {boolean} `true` if the entire string matches the address pattern.
 */
function isEmail(s) {
  if (!isEmail.rx_email) {
    // Taken from http://stackoverflow.com/a/16016476/460084
    const sQtext = '[^\\x0d\\x22\\x5c\\x80-\\xff]';
    const sDtext = '[^\\x0d\\x5b-\\x5d\\x80-\\xff]';
    const sAtom = '[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+';
    const sQuotedPair = '\\x5c[\\x00-\\x7f]';
    const sDomainLiteral = '\\x5b(' + sDtext + '|' + sQuotedPair + ')*\\x5d';
    const sQuotedString = '\\x22(' + sQtext + '|' + sQuotedPair + ')*\\x22';
    const sDomainRef = sAtom;
    const sSubDomain = '(' + sDomainRef + '|' + sDomainLiteral + ')';
    const sWord = '(' + sAtom + '|' + sQuotedString + ')';
    const sDomain = sSubDomain + '(\\x2e' + sSubDomain + ')*';
    const sLocalPart = sWord + '(\\x2e' + sWord + ')*';
    const sAddrSpec = sLocalPart + '\\x40' + sDomain; // complete RFC822 email address spec
    const sValidEmail = '^' + sAddrSpec + '$'; // as whole string
    isEmail.rx_email = new RegExp(sValidEmail);
  }
  return isEmail.rx_email.test(s);
}

// isUrl will also recognize URLs such as google.com, http://www.google.bla,
// http://google.bla and www.google.bla, but not google.bla.
Generic Object Oriented Solution For people like me that use frameworks like angular that don't allow manipulating DOM directly, I created a function that takes a string and returns an array of url/plainText objects that can be used to create any UI representation that you want. URL regex For URL matching I used (slightly adapted) h0mayun regex: /(?:(?:https?:\/\/)|(?:www\.))[^\s]+/g My function also drops punctuation characters from the end of a URL like . and , that I believe more often will be actual punctuation than a legit URL ending (but it could be! This is not rigorous science as other answers explain well) For that I apply the following regex onto matched URLs /^(.+?)([.,?!'"]*)$/. Typescript code export function urlMatcherInText(inputString: string): UrlMatcherResult[] { if (! inputString) return []; const results: UrlMatcherResult[] = []; function addText(text: string) { if (! text) return; const result = new UrlMatcherResult(); result.type = 'text'; result.value = text; results.push(result); } function addUrl(url: string) { if (! url) return; const result = new UrlMatcherResult(); result.type = 'url'; result.value = url; results.push(result); } const findUrlRegex = /(?:(?:https?:\/\/)|(?:www\.))[^\s]+/g; const cleanUrlRegex = /^(.+?)([.,?!'"]*)$/; let match: RegExpExecArray; let indexOfStartOfString = 0; do { match = findUrlRegex.exec(inputString); if (match) { const text = inputString.substr(indexOfStartOfString, match.index - indexOfStartOfString); addText(text); var dirtyUrl = match[0]; var urlDirtyMatch = cleanUrlRegex.exec(dirtyUrl); addUrl(urlDirtyMatch[1]); addText(urlDirtyMatch[2]); indexOfStartOfString = match.index + dirtyUrl.length; } } while (match); const remainingText = inputString.substr(indexOfStartOfString, inputString.length - indexOfStartOfString); addText(remainingText); return results; } export class UrlMatcherResult { public type: 'url' | 'text' public value: string }
// Here is a small solution for a React app that does not use any library.
// Note that this method only works when the URL is a separate,
// space-delimited word (i.e. it is not attached to any other character).
// The component returns a paragraph with link detection.
import React from "react";

interface Props {
  /** Plain text whose space-separated words are scanned for URLs. */
  paragraph: string;
}

// Matches a whole word that looks like a URL: optional http(s) scheme,
// dotted lowercase host, 2-5 letter TLD, optional port and optional path.
const REGEX = /^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$/gm;

/**
 * Renders `paragraph` inside a <div>, turning every word that matches REGEX
 * into a link and leaving all other words as plain text.
 *
 * The component is named after the module's default export; previously the
 * export referenced an identifier that was never defined.
 */
const LinkParaGraph: React.FC<Props> = ({ paragraph }) => {
  const words = paragraph.split(' ');
  return (
    <div>
      {words.map((word, index) =>
        word.match(REGEX) ? (
          // Index keys are acceptable here: the list is derived from the
          // prop on every render and is never reordered.
          <React.Fragment key={index}>
            {/* Scheme-less matches would otherwise resolve as relative links. */}
            <a href={/^https?:\/\//.test(word) ? word : `http://${word}`}>{word}</a>
            {' '}
          </React.Fragment>
        ) : (
          word + ' '
        )
      )}
    </div>
  );
};

export default LinkParaGraph;
tmp.innerText is undefined. You should use tmp.innerHTML function strip(html) { var tmp = document.createElement("DIV"); tmp.innerHTML = html; var urlRegex =/(\b(https?|ftp|file):\/\/[-A-Z0-9+&##\/%?=~_|!:,.;]*[-A-Z0-9+&##\/%=~_|])/ig; return tmp.innerHTML .replace(urlRegex, function(url) { return '\n' + url })
You can use a regex like this to extract normal url patterns. (https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}) If you need more sophisticated patterns, use a library like this. https://www.npmjs.com/package/pattern-dreamer
JS/JQUERY: How to match and replace occurrences inside specified strings?
I swear I tried figuring this out myself all day, but my regex-fu is just not that good. I'm trying to create a small parser function to convert strings with URLs into HTML-coded <a> and <img> tags. I know how complex a regex can get when trying to figure out which URLs to convert to what in a big string, so what I did is simply prefix the string to convert with a flag that tells the parser how to format it, and postfix it with the ";" character to tell the parser where that particular URL ends. This way the parser has less guesswork to do, which makes it easier to regex-match and faster to execute. I really don't need a generalized match-and-replace-all. So my formatting is as follows, where "X" is the URL string: For URLs it will be url=X; For IMAGES it will be img=X; so anything in between my prefix and postfix must be converted accordingly. So for example, for images in my document, the string could be: click this image img=http://example.com/image1.jpg; and I need that converted to click this image <a href="http://example.com/image1.jpg" target="_blank"> <img class="img img-responsive" src="http://example.com/image1.jpg"/></a> I am able to do this easily in PHP with the preg_match() function; here's the code: preg_match('/\img=(.+?)\;/i', $item_des, $matches) I decided to push this routine to the browser instead of the backend (PHP), so I need a similar or better JS solution. Hoping someone can help here, thanks!
// Try the code below.

/**
 * Converts a single phrase that may end in an `img=X` or `url=X` marker into
 * HTML.
 *
 * - `text img=X` becomes `text` followed by a linked <img> pointing at X.
 * - `text url=X` becomes `text` followed by a link to X.
 * - A phrase without a marker is returned unchanged, so plain text between
 *   markers is not lost.
 *
 * Everything after the first marker is treated as the URL (a trailing ";"
 * terminator is removed), so URLs that themselves contain "img=" or "url="
 * in their query string are kept intact. No HTML escaping is performed.
 *
 * @param {string} str - One phrase, normally a piece of the input split on ";".
 * @returns {string} The HTML for that phrase.
 */
function processPhrase(str) {
  // \b keeps words that merely end in "img"/"url" (e.g. "curl=") from matching.
  const marker = /\b(img|url)=/.exec(str);
  if (!marker) {
    return str;
  }

  const type = marker[1];
  const leadingText = str.slice(0, marker.index);
  const url = str.slice(marker.index + marker[0].length).replace(';', '');

  if (type === 'img') {
    return leadingText + "<a href='" + url + "' target='_blank'><img src='" + url + "'/></a>";
  }
  return leadingText + "<a href='" + url + "' target='_blank'>" + url + "</a>";
}

const str = "click this image img=http://example.com/image1.jpg;image2 img=http://example.com/image2.jpg;";

// Each marker is terminated by ";", so splitting on it yields one phrase per
// marker (plus any trailing plain text).
const totalRes = str
  .split(';')
  .map((phrase) => processPhrase(phrase))
  .join('');

console.log(totalRes);
You can use this regexp /(img|url)=(.+?);/g: (img|url) : the type, should be grouped so we will know what to do with the value = : literal "=" (.+?) : a number of characters (use the non-greedy ? so it will match as fewer as possible) ; : literal ";" Read more about non-greedy regexps here. Example: var str = "click this image img=http://i.imgur.com/3wY30O4.jpg?a=123&b=456; and check this URL url=http://google.com/;. Bye!"; // executer is an object that has functions that apply the changes for each type (you can modify the functions for your need) var executer = { "url": function(e) { return '<a target="_blank" href="' + e + '">' + e + '</a>'; }, "img": function(e) { return '<a target="_blank" href="' + e + '"><img src="' + e + '"/></a>'; } } var res = str.replace(/(img|url)=(.+?);/g, function(m, type, value) { return executer[type](value); // executer[type] will be either executer.url or executer.img, then we pass the value to that function and return its returned value }); console.log(res);
Regex mapping in Firefox context menu contentScript
I am developing context menu add-on for Firefox. I am trying to get the selectedText and validate if it is a number. If it is a number i am using that number value to process further. But, i got stuck at a point where i am trying to replace [(,)] using regex in javascript replace method. Following is the code which fails to map any number starting/ending with ( or ): var menuItemLRG = contextMenu.Item({ label: "LRG", data: "http://myurl/&val=:", contentScript: 'self.on("click", function (node, data) {' + ' var selectedText = window.getSelection().toString();' + ' var formattedText1 = selectedText.trim();' + ' var formattedText2 = formattedText1.replace(/^[,\[\]()]*/g,"");' + ' var formattedText3 = formattedText2.replace(/[,\[\]()]*$/g,"");' + ' console.log(formattedText3); '+ ' var regExp = new RegExp(/^[0-9]+$/);' + ' if (regExp.test(formattedText3) == true) {' + ' console.log("URL to follow :"+data+formattedText3);' + ' window.open(data+formattedText3);' + ' } '+ '});' }); Above code fails to replace ( or ) in sample inputs: (5663812, 11620033). But, a vanilla test like the following succeeds: <script> var str = "(2342423,])"; var tmpVal1 = str.replace(/^[,\[\]()]*/g,""); var tmpVal2 = tmpVal1.replace(/[,\[\]()]*$/g,""); var regExp = new RegExp(/^[0-9]+$/); if (regExp.test(tmpVal2) == true) { alert(tmpVal2); } </script>
After much trial and error, I found the issue. When we escape a character inside a single-quoted string, we need to add one more backslash so that the escape character itself is preserved; otherwise the single escape \] is reduced to ], which ends the regex character class prematurely. In this case: ' var formattedText2 = formattedText1.replace(/^[,\[\]()]*/g,"");' is decoded as: var formattedText2 = formattedText1.replace(/^[,[]()]*/g,""); instead of: var formattedText2 = formattedText1.replace(/^[,\[\]()]*/g,""); So adding one more escape character before each escape character makes the pattern resolve correctly: ' var formattedText2 = formattedText1.replace(/^[,\\[\\]()]*/g,"");' Sorry for wasting your time in analyzing the cause, if any.
Deal with query strings in JS, like http_build_query, etc
Here's what I have if(condition1) { location.href = location.href+'/?site_type=normal'; } else if(condition2) { location.href = location.href+'/?site_type=other'; } Of course, if the location.href already has query vars on it, thats a problem, etc. I need to Find the vars from the query string if site_type already exists, replace the value with either 'normal' or 'other' rebuild the url with the new site_type edit: I found I needed to account for all kinds of URLs: domain.com domain.com/path/to/sth/ domain.com/?site_type=normal domain.com?var=123&foo=987 domain.com/path/?site_type=normal&var=123&foo=987 So, here's what I came up with, suggestions welcome: var searchstring = window.location.search; var url = window.location.href; console.log('search: ' + searchstring); console.log( 'url: ' + url); // strip search from url url = url.replace(searchstring,""); console.log( 'url: ' + url); //strip site_type from search searchstring = searchstring.replace("&site_type=normal","") .replace("&site_type=other","") .replace("?site_type=normal","") .replace("?site_type=other","") .replace("?","") ; console.log('search: ' + searchstring); if(searchstring != ''){searchstring = '&' + searchstring;} var final = url + '?site_type=normal' + searchstring; final = final.replace("&&","&"); console.log('final: ' + final);
// You can directly access the query string with window.location.search.
// You can convert it to an object using this regex trick: every
// "key" or "key=value" segment between "?", "&" and "=" becomes one entry.
const queryString = {};
window.location.search.replace(/([^?=&]+)(=([^&]*))?/g, (segment, key, assignment, value) => {
  // A parameter without "=value" is stored as an empty string so that it is
  // not serialized as the literal text "undefined" below.
  queryString[key] = value === undefined ? '' : value;
  return segment;
});

// Then set the site_type on queryString appropriately.
queryString.site_type = 'normal';

// And finally, convert it back into a string and set that as
// window.location.search. Keys and values are written back exactly as they
// were read (still percent-encoded), joined with "&" and with no leading "&".
const searchString = Object.entries(queryString)
  .map(([key, value]) => `${key}=${value}`)
  .join('&');
window.location.search = searchString;
Here's a way to do this: //remove existing param and append new one.. var newHref = window.location.href.replace(window.location.search,"") + '?site_type=other'; //change href window.location.href = newHref; works only if you have one parameter that you want to replace, otherwise it would remove all parameters.
For example, if you have yourpage.com/?site_type=normal and you need only the website address without the query variables, you can clear them: var novars= location.href.replace(window.location.search,"") In this case novars = yourpage.com/ To get just the variable, you can do this: var site_type = window.location.search.replace("?site_type=",""); Here you get the site_type value, whether it is normal or other; in this case your variable site_type = "normal" To rebuild the URL, you can just append the new site_type: location.href = novars+"?site_type=normal" or location.href = novars+"?site_type=other"
Adding/Modify query string / GET variables in a url with javascript
I want to replace the values of GET variables in a URL and, if the variable does not exist, add it to the URL. EDIT: I am doing this to an element's href, not the page's current location. I am not good with JavaScript, but I do know how to use jQuery quite well and the basics of JavaScript. I do know how to write regex, but not how to use the JavaScript regex syntax or which functions to use it with. Here is what I have so far, and it does have an error on line 3. See it on jsfiddle (or below): http://jsfiddle.net/MadLittleMods/C93mD/ function addParameter(url, param, value) { var pattern = new RegExp(param + '=(.*?);', 'gi'); return url.replace(pattern, param + '=' + value + ';'); alert(url); }
// No need to use jQuery on this one. Regular expressions and string
// functions are sufficient. See the commented code below.

/**
 * Sets the query-string parameter `param` to `value` in `url`.
 *
 * If the parameter is already present its value is replaced in place;
 * otherwise it is appended, starting a query string with "?" when the URL
 * has none. A "#fragment", if any, is kept at the end of the result.
 *
 * `param` and `value` are inserted verbatim (no percent-encoding is applied),
 * so callers should pass already-encoded text.
 *
 * @param {string|{toString(): string}} url - URL to modify (e.g. an element's href).
 * @param {string} param - Parameter name.
 * @param {string} value - New parameter value.
 * @returns {string} The updated URL.
 */
function addParameter(url, param, value) {
  const full = url.toString();

  // Split at the FIRST "#" only, so a fragment that itself contains "#" is
  // preserved in full.
  const hashIndex = full.indexOf('#');
  const base = hashIndex === -1 ? full : full.slice(0, hashIndex);
  const hash = hashIndex === -1 ? '' : full.slice(hashIndex + 1);

  // Escape regex metacharacters so names such as "a[0]" or "x.y" are matched
  // literally.
  const escapedParam = String(param).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // The parameter preceded by "?" or "&", then "=" and a non-greedy value
  // that stops (positive lookahead) at the next "&" or the end of the string.
  const existing = new RegExp(`([?&])${escapedParam}=.*?(?=&|$)`);

  let newURL;
  if (existing.test(base)) {
    // Replace it, reusing the captured "?" or "&". A replacer function is
    // used so that "$" sequences in `value` are not treated as replacement
    // patterns.
    newURL = base.replace(existing, (match, separator) => `${separator}${param}=${value}`);
  } else if (base.includes('?')) {
    // There is a query string already: append, without doubling a separator
    // the URL already ends with.
    const joiner = /[?&]$/.test(base) ? '' : '&';
    newURL = `${base}${joiner}${param}=${value}`;
  } else {
    // No query string at all: start one.
    newURL = `${base}?${param}=${value}`;
  }

  return hash ? `${newURL}#${hash}` : newURL;
}

// Update: the code now handles the case where there is a hash on the URL.
// Edit: missed a case! The code now checks to see if there is a query string
// at all.
I would go with this small but complete library to handle urls in js: https://github.com/Mikhus/jsurl
See Change URL parameters. It answers your question in a more general manner (changing any url parameter). There are solutions for both jQuery and regular js in the answers section. It also looks like url.replace should be location.replace but I may be wrong (that statement's based on a quick google search for 'url.replace javascript').
<script type="text/javascript">
  // When any <input class="letter"> is clicked, navigate to the current page
  // with the "letter" query parameter set to the clicked input's value.
  // An existing "letter" parameter is replaced; otherwise it is added.
  //
  // The URL API is used instead of splitting the href on "?", "&" and "="
  // by hand: it keeps a "#fragment" at the end of the URL (rather than
  // appending the parameter after it), percent-encodes the value, and does
  // not truncate values that contain "=".
  $(document).ready(function () {
    $('input.letter').click(function () {
      // `this` is the clicked input element, so a regular function (not an
      // arrow function) is required here.
      const target = new URL(window.location.href);
      target.searchParams.set('letter', this.value);
      window.location = target.toString();
    });
  });
</script>