<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>The Official Blog of BigML.com</title>
	<atom:link href="http://blog.bigml.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://blog.bigml.com</link>
	<description>Machine Learning Made Simple</description>
	<lastBuildDate>Mon, 17 Jun 2013 17:19:56 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='blog.bigml.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>The Official Blog of BigML.com</title>
		<link>http://blog.bigml.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://blog.bigml.com/osd.xml" title="The Official Blog of BigML.com" />
	<atom:link rel='hub' href='http://blog.bigml.com/?pushpress=hub'/>
		<item>
		<title>Giving Your Data the Machine Learning Treatment</title>
		<link>http://blog.bigml.com/2013/06/17/giving-your-data-the-machine-learning-treatment/</link>
		<comments>http://blog.bigml.com/2013/06/17/giving-your-data-the-machine-learning-treatment/#comments</comments>
		<pubDate>Mon, 17 Jun 2013 17:19:56 +0000</pubDate>
		<dc:creator>charleslparker</dc:creator>
				<category><![CDATA[Tutorial]]></category>
		<category><![CDATA[Video]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=4491</guid>
		<description><![CDATA[We&#8217;ve committed and recommitted ourselves at BigML to providing non-experts with a tool that allows them to do effective machine learning quickly. As we&#8217;ve worked towards this goal, we&#8217;ve heard from many people who want to apply machine learning to their data, but just aren&#8217;t sure how to massage their problem into the standard [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4491&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;">We&#8217;ve committed and recommitted ourselves at BigML to providing non-experts with a tool that allows them to do effective machine learning quickly. As we&#8217;ve worked towards this goal, we&#8217;ve heard from many people who want to apply machine learning to their data, but just aren&#8217;t sure how to massage their problem into the standard machine learning format of data to model to prediction.  We&#8217;ve decided to create a series of videos that takes you from end to end in the machine learning process, using instructive examples along the way.</p>
<p style="text-align:justify;"><a title="Parker in Action" href="http://www.youtube.com/watch?v=hV8lAjIVxiY"><img class="aligncenter size-large wp-image-4512" alt="charlie_parker" src="http://littleml.files.wordpress.com/2013/06/charlie_parker.png?w=497&#038;h=244" width="497" height="244" /></a></p>
<p style="text-align:justify;"><a title="BigML - Data Preparation" href="http://www.youtube.com/watch?v=hV8lAjIVxiY">Here&#8217;s the first one</a>, that shows how to go from data concept to BigML dataset.  We hope you find it useful in your own machine learning explorations!</p>
<p style="text-align:justify;"><span class='embed-youtube' style='text-align:center; display: block;'><iframe class='youtube-player' type='text/html' width='497' height='310' src='http://www.youtube.com/embed/hV8lAjIVxiY?version=3&#038;rel=1&#038;fs=1&#038;showsearch=0&#038;showinfo=1&#038;iv_load_policy=1&#038;wmode=transparent' frameborder='0'></iframe></span></p>
<p style="text-align:justify;">
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/4491/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/4491/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4491&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/06/17/giving-your-data-the-machine-learning-treatment/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7dd9c63ce59f4ee73bbee56cdc44fe8d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">charleslparker</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/06/charlie_parker.png?w=497" medium="image">
			<media:title type="html">charlie_parker</media:title>
		</media:content>
	</item>
		<item>
		<title>Matter over Mind in Machine Learning</title>
		<link>http://blog.bigml.com/2013/06/13/matter-over-mind-in-machine-learning/</link>
		<comments>http://blog.bigml.com/2013/06/13/matter-over-mind-in-machine-learning/#comments</comments>
		<pubDate>Thu, 13 Jun 2013 17:39:58 +0000</pubDate>
		<dc:creator>charleslparker</dc:creator>
				<category><![CDATA[Data]]></category>
		<category><![CDATA[Machine Learning]]></category>
		<category><![CDATA[machine learning]]></category>
		<category><![CDATA[relevance]]></category>
		<category><![CDATA[society]]></category>
		<category><![CDATA[Wagstaff]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=4486</guid>
		<description><![CDATA[I am fortunate enough to have had a number of conversations with Dr. Kiri Wagstaff of NASA&#8217;s JPL on a number of occasions (you might as well get the jokes about &#8220;not having to be a rocket scientist to understand machine learning&#8221; out of the way right now). Wagstaff is a brilliant scientist.  On top [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4486&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;">I am fortunate enough to have had a number of conversations with <a title="Kiri Wagstaff" href="http://www.wkiri.com/" target="_blank">Dr. Kiri Wagstaff of NASA&#8217;s JPL</a> on a number of occasions (you might as well get the jokes about &#8220;not having to be a rocket scientist to understand machine learning&#8221; out of the way right now).</p>
<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/06/mind.jpg"><img class="aligncenter size-full wp-image-4496" alt="Matter over Mind in Machine Learning" src="http://littleml.files.wordpress.com/2013/06/mind.jpg?w=497&#038;h=452" width="497" height="452" /></a></p>
<p style="text-align:justify;">Wagstaff is a brilliant scientist.  On top of that, and fortunately for all of us, she works very &#8220;close to the data&#8221;, using her machine learning expertise to solve important problems that directly impact people other than machine learning researchers.  This closeness to the data is somewhat rare among machine learning experts but is becoming more and more common.  Our own chief scientist Tom Dietterich is a pioneer in <a title="Computational Sustainability" href="http://www.cis.cornell.edu/ics/" target="_blank">computational sustainability</a> and <a title="Ecosystem Informatics" href="http://web.engr.oregonstate.edu/~tgd/publications/ijcai2009-dietterich-abstract.pdf" target="_blank">ecosystem informatics</a>.  Another acquaintance of mine, Rayid Ghani, left a lucrative position at Accenture Research to head up <a title="Data Analytics and Obama" href="http://swampland.time.com/2012/11/07/inside-the-secret-world-of-quants-and-data-crunchers-who-helped-obama-win/" target="_blank">President Obama&#8217;s vaunted data analytics team</a>.</p>
<p style="text-align:justify;">At the 2012 ICML conference, Wagstaff wrote an <a title="Machine Learning that Matters" href="http://www.wkiri.com/research/papers/wagstaff-MLmatters-12.pdf" target="_blank">excellent little position paper</a> calling for &#8220;Machine Learning that Matters&#8221; (besides the paper, there&#8217;s also an <a title="Machine Learning impact Forum" href="http://www.mlimpact.com/">online forum</a>).  In it, she points out (with little argument from the gallery) that machine learning papers typically proceed as follows:  Find a dataset for a problem that is only partially solved, invent or improve an algorithm to solve this problem such that some improvement on some standard metric is made on that dataset, write it up and publish.  Repeat until tenured.</p>
<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/06/mind2.png"><img class="aligncenter size-full wp-image-4500" alt="Find a dataset for a problem..." src="http://littleml.files.wordpress.com/2013/06/mind2.png?w=497&#038;h=198" width="497" height="198" /></a></p>
<p style="text-align:justify;">Wagstaff notes in her paper that, while this approach produces a substantial number of computer science professors, it isn&#8217;t nearly as successful at getting machine learning algorithms in a position to make a measurable difference in society at large.  At fault, she says, is both the data we use to test algorithms and the metrics we use to evaluate our tests.  Many of the datasets we use in the scientific literature in machine learning have been around for years or even decades.  Producing a small performance improvement on the vast majority of these datasets will have little effect outside of machine learning conferences.  And what do we even mean when we say &#8220;small performance improvement&#8221;?  If the AUC of my classifier on some dataset is 0.85 and that of yours is 0.81, what does that say about the consequences of using my classifier versus yours in the context of that data?  The answer, of course, is totally dependent on the data.  And if that data is meaningless, then we&#8217;re well on our way to producing technology useful to absolutely no one.</p>
<p style="text-align:justify;">She then goes on to list some of the challenges to socially relevant machine learning, and calls for a renewed focus on real-world data in the machine learning literature.  She also calls for metrics that reflect the real world impact of the technology (think, instead of &#8220;F-Measure&#8221;, of something like &#8220;gallons of water sanitized&#8221; or &#8220;doses of vaccine distributed&#8221;). Finally, she provides a very nice list of challenge problems for machine learning researchers, which we&#8217;ll get to later.  Definitely read the whole paper, which is very accessible to non-experts, to get the full-fat version of the argument.  For now, though, I&#8217;m going to play devil&#8217;s advocate a bit with Wagstaff&#8217;s excellent work.  I&#8217;m not doing this to be argumentative, as I mostly agree with everything she is saying.  Rather, I&#8217;m going to use it as a jumping off point to maybe inspire some of the work she&#8217;s hoping will happen.</p>
<h1>Sympathy for the Devil</h1>
<p style="text-align:justify;">The conundrum that Wagstaff points out is possibly as old as science itself. Dijkstra advises scientists to strive in their work for both <a title="Advice to a Young Scientist" href="http://www.cs.utexas.edu/users/EWD/transcriptions/EWD10xx/EWD1055A.html" target="_blank">&#8220;scientific soundness&#8221; and &#8220;social relevance&#8221;</a>.  Science, especially computer science, occupies a shadowy space between mathematics and engineering.  Like mathematicians, we would like our work to be abstract and general, to say something fundamental about optimization or, even better, about intelligence. Like engineers, however, we would also like to solve a specific problem in the world.  To be able to point to a cured patient, or an organized news feed, or a translated document and say &#8220;my algorithm did that&#8221;.</p>
<p style="text-align:justify;">And therein lies the problem:  The more abstract and general something is, the less specific it is. Machine learning scientists, both writers and reviewers, faced with the choice, tend towards the former. Part of the reason, I suspect, is that it&#8217;s easier to retain some degree of scientific objectivity when dealing with the abstract.  Suppose a paper proposes a classifier system that lowers the total cost of cancer screenings by half and only raises the rate of undetected cancers by 1%.  One suspects that reviewers who have lost loved ones to cancer may feel differently about this than ones who have not. Another part of the reason is a healthy suspicion of solutions that don&#8217;t generalize past a single case, as such niche solutions are <a title="The Three Golden Rules" href="http://www.cs.utexas.edu/~EWD/transcriptions/EWD06xx/EWD637.html" target="_blank">an important step on the road to charlatanry</a>.</p>
<p style="text-align:justify;">Nonetheless, science can and does impact the real world in very specific ways. The frustrating part about machine learning in particular is that we are so tantalizingly close to having our cake and eating it, too.  A century ago, Einstein labored at relativity with (probably) no thought to GPS satellites, so he could not have appreciated the <a title="Some applications of relativity" href="http://phys.org/news4798.html" target="_blank">full effect his work would have on society</a>. Scientists are accustomed to the idea that our work might not be useful to society for years after it occurs, and so there&#8217;s no stigma attached to producing work that is scientifically sound but not (yet) socially relevant.  But machine learning is so directly applicable to so many problems that at times it seems downright <em>easy</em> to find a landing zone for these algorithms. Yet, the traditional scientific mindset pulls us back to the abstract.  Even though the gap between theory and practice in machine learning seems nearly bridged, it looks like we still need a little help.</p>
<h1>Fixing a Hole</h1>
<p style="text-align:justify;">In Wagstaff&#8217;s paper, she lays out a program of challenges for machine learning researchers interested in making a difference in the real world:</p>
<ol>
<li>A law passed or legal decision made that relies on the result of an ML analysis.</li>
<li>$100M saved through improved decision making provided by an ML system.</li>
<li>A conflict between nations averted through high quality translation provided by an ML system.</li>
<li>A 50% reduction in cybersecurity break-ins through ML defenses.</li>
<li>A human life saved through a diagnosis or intervention recommended by an ML system.</li>
<li>Improvement of 10% in one country’s Human Development Index (HDI) attributable to an ML system.</li>
</ol>
<p style="text-align:justify;">This is an excellent set of challenges, and I would guess that few or none will be solved by traditional machine learning scientists (with the exception of #5, which I think may have already been met depending on how one defines &#8220;saved&#8221;).  This isn&#8217;t because the problems themselves are terribly hard from a machine learning perspective (though they might be), but simply because we don&#8217;t have many machine learning researchers at high levels of government, or writing laws, or responsible for network security.  They have spent their careers studying machine learning, in all of its arcane mathematical glory.  To think they might also know enough about law to apply it in that context is asking a lot.  Who will come to their aid?</p>
<p><em><strong>You</strong></em>.</p>
<p style="text-align:justify;">At BigML, we&#8217;ve committed ourselves to putting machine learning technology into the hands of people who can meet the challenges that Wagstaff has put forth. You have the data and the expertise to know which problems in your field of interest can be solved by machine learning; we have the machine learning expertise to provide you with that solution.  I don&#8217;t think we need machine learning gurus to tackle Wagstaff&#8217;s problems, I think we need <strong>gurus from other fields who know enough about machine learning to use it</strong>. Wagstaff alludes to this in her paper. <em>If we can get usable, flexible, dependable machine learning software into the hands of domain experts, benefits to society are bound to follow.</em></p>
<h1>If I Had a Hammer</h1>
<p style="text-align:justify;">I often tell people that machine learning is a hammer.  Machine learning scientists are busy every year, every day, improving that hammer, making it more durable, harder, improving the shape, and on and on.  But as Wagstaff points out, that hammer is no good to anyone until someone starts hitting nails with it.  BigML is giving the world a chance to swing that hammer, whether you&#8217;re an expert in machine learning or anything else.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/4486/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/4486/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4486&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/06/13/matter-over-mind-in-machine-learning/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7dd9c63ce59f4ee73bbee56cdc44fe8d?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">charleslparker</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/06/mind.jpg" medium="image">
			<media:title type="html">Matter over Mind in Machine Learning</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/06/mind2.png" medium="image">
			<media:title type="html">Find a dataset for a problem...</media:title>
		</media:content>
	</item>
		<item>
		<title>Quandl + BigML = Powerful financial, economic, and social predictive models</title>
		<link>http://blog.bigml.com/2013/05/31/quandl-bigml-powerful-financial-economic-and-social-predictive-models/</link>
		<comments>http://blog.bigml.com/2013/05/31/quandl-bigml-powerful-financial-economic-and-social-predictive-models/#comments</comments>
		<pubDate>Fri, 31 May 2013 00:04:51 +0000</pubDate>
		<dc:creator>candidozuriaga</dc:creator>
				<category><![CDATA[API]]></category>
		<category><![CDATA[BigML.io]]></category>
		<category><![CDATA[BigMLer]]></category>
		<category><![CDATA[Data]]></category>
		<category><![CDATA[Partners]]></category>
		<category><![CDATA[Streaming Data]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=4334</guid>
		<description><![CDATA[Quandl is a huge repository of financial, economic and social datasets. Registration and use of Quandl is currently, and will always be free. Quandl can be used on the web and/or through a public API. Most of Quandl’s datasets are univariate, which provide interesting insight and lend themselves to interesting time-series forecasting models.  In addition, there’s [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4334&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;"><a style="text-align:justify;font-size:13px;" href="http://www.quandl.com" target="_blank">Quandl</a><span style="text-align:justify;font-size:13px;"> </span>is a huge repository of financial, economic and social datasets. Registration and use of Quandl is currently, and will always be free. Quandl can be used on the web and/or through a public API.</p>
<p style="text-align:center;"><a href="http://www.quandl.com"><img class="size-full wp-image-4432 aligncenter" alt="www.quandl.com" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-03-52-33.png?w=497&#038;h=291" width="497" height="291" /></a></p>
<p style="text-align:justify;">Most of Quandl’s datasets are univariate, which provide interesting insight and lend themselves to interesting time-series forecasting models.  In addition, there’s a great utility in which you can combine columns from different datasets to create a more complex item called a <strong><a href="http://www.quandl.com/help/supersets" target="_blank">Superset</a></strong>. BigML works very well with these multivariate Supersets.</p>
<p style="text-align:justify;">You can choose several columns from different datasets, and build a custom Superset.  If the frequencies in your Superset are different (e.g., if one column tracks data monthly, while the other does daily) the Superset will normalize your data, adjusting to the lowest frequency (e.g., monthly instead of daily).</p>
<h1>Creating a Superset</h1>
<p style="text-align:justify;">1. Locate the different datasets that contain the columns you want to add to your Superset:</p>
<p style="text-align:justify;"><a href="http://www.quandl.com/FRED-Federal-Reserve-Economic-Data/POP-Total-Population-All-Ages-including-Armed-Forces-Overseas"><img class="alignnone size-medium wp-image-4385" style="margin:0;" alt="Total Population" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-19-59-06.png?w=113&#038;h=300" width="113" height="300" /></a> <a href="http://www.quandl.com/FRED-Federal-Reserve-Economic-Data/ALTSALES-Light-Weight-Vehicle-Sales-Autos-Light-Trucks"><img class="alignnone size-medium wp-image-4388" style="margin:0;" alt="Vehicle sales" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-20-00-58.png?w=103&#038;h=300" width="103" height="300" /></a> <a href="http://www.quandl.com/FRED-Federal-Reserve-Economic-Data/TOTALSL-Total-Consumer-Credit-Owned-and-Securitized-Outstanding"><img class="alignnone size-medium wp-image-4391" style="margin:0;" alt="Consumer Credit" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-20-10-06.png?w=112&#038;h=300" width="112" height="300" /></a> <a href="http://www.quandl.com/FRED-Federal-Reserve-Economic-Data/HSN1F-New-One-Family-Houses-Sold-United-States"><img class="alignnone size-medium wp-image-4386" style="margin:0;" alt="Houses sold" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-19-59-54.png?w=101&#038;h=300" width="101" height="300" /></a></p>
<p style="text-align:justify;">2. Right click on the column header, and add the column to an existing Superset, or create a new one when adding the first field.</p>
<p style="text-align:justify;">3. Be sure to check the fields&#8217; frequency, because the final Superset frequency will be the lowest frequency of all the fields added.</p>
<p style="text-align:justify;">See below a screenshot from a <a title="US Superset" href="http://www.quandl.com/USER_1O2-Czuriaga/1O6-US-Quandl-Superset" target="_blank">sample Superset</a>, created from several datasets about economic and demographic indicators in the United States.</p>
<div id="attachment_4394" class="wp-caption alignnone" style="width: 522px"><a href="http://www.quandl.com/USER_1O2-Czuriaga/1O6-US-Quandl-Superset"><img class=" wp-image-4394" alt="Custom Superset" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-19-58-281.png?w=512&#038;h=250" width="512" height="250" /></a><p class="wp-caption-text">Quandl Superset</p></div>
<h1>Use your Superset in BigML</h1>
<p style="text-align:justify;">Once the Superset is created and columns are renamed with the desired labels, we can proceed in one of four different ways to upload the result to <a href="https://bigml.com" target="_blank">BigML</a>:</p>
<ol>
<li style="text-align:justify;">Click on the <strong>Download</strong> button in Quandl and export as .CSV. This file is now ready to be uploaded to BigML as a new Source. (For details on how to upload files into BigML, please visit <a href="http://bigml.com/how_it_works" target="_blank">here</a>, and follow the links to our helpful videos)<a href="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-15-17-45.png"><img class="size-full wp-image-4444 aligncenter" style="margin:10px 0;" alt="BigML Upload Source" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-15-17-45.png?w=497&#038;h=325" width="497" height="325" /></a></li>
<li style="text-align:justify;">Get the <a href="http://www.quandl.com/api/v1/datasets/USER_1O2/1O6.csv" target="_blank">direct link</a>, by clicking the <em>&#8220;show API call&#8221;</em> in the Download modal window within Quandl, below the <strong>Download Data</strong> button.<a href="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-15-35-21.png"><img class="size-full wp-image-4446 aligncenter" style="margin:10px 0;" alt="Captura de pantalla 2013-05-30 a la(s) 15.35.21" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-15-35-21.png?w=497&#038;h=212" width="497" height="212" /></a>Then use this link as an external source in BigML:<a href="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-15-20-13.png"><img class="size-full wp-image-4443 aligncenter" style="margin:10px 0;" alt="bigML Upload Remote Source" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-15-20-13.png?w=497&#038;h=343" width="497" height="343" /></a></li>
<li style="text-align:justify;">Or use the Quandl link to upload the source using our public <a href="https://bigml.com/developers" target="_blank">API</a>:<a href="https://bigml.com/developers" target="_blank"><br />
</a>
<pre class="brush: bash; collapse: false; highlight: [1,2,3,4,5,6,7,8,9,10,11,12]; light: true; title: ; wrap-lines: false; notranslate">
curl --silent https://bigml.io/source?$BIGML_AUTH \
     -X &quot;POST&quot; \
     -H &quot;content-type: application/json&quot; \
     -d '{&quot;remote&quot;: &quot;http://www.quandl.com/api/v1/datasets/USER_1O2/1O6.csv&quot;, &quot;name&quot;: &quot;USA Unemployment Rate&quot;}'
</pre>
</li>
<li style="text-align:justify;">Or you can upload the Source, generate the Dataset and create the Model directly with a single <a href="https://github.com/bigmlcom/bigmler" target="_blank">bigmler</a> call (be sure to put the objective field at the end): 
<pre class="brush: bash; collapse: false; highlight: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]; light: true; title: ; wrap-lines: false; notranslate">
bigmler --train &quot;http://www.quandl.com/api/v1/datasets/USER_1O2/1O6.csv&quot; \
        --name &quot;USA Unemployment Rate&quot; --tag &quot;US Indicators&quot;
[2013-05-25 03:29:11] Creating source.
[2013-05-25 03:29:22] Source created: https://bigml.com/dashboard/source/51a013e9925ded36f4000103
[2013-05-25 03:29:22] Creating dataset.
[2013-05-25 03:29:25] Dataset created: https://bigml.com/dashboard/dataset/51a013f1925ded36f3000310
[2013-05-25 03:29:25] Creating model.
[2013-05-25 03:29:31] Model created: https://bigml.com/dashboard/model/51a013f5925ded36f3000314.

Generated files:

SatMay2513_032911
├─bigmler_sessions
├─dataset
├─models
└─source
</pre>
</li>
</ol>
<p style="text-align:justify;">And finally the <a href="http://bml.io/11hYExv" target="_blank">model</a> is created in BigML.  In the view below the decision tree visualization has the unemployment rate as the predicted field.  In this case, unemployment is predicted at 5.97%, based on the fact that there are fewer than 14,549,000 government employees, the civilian employment ratio is between 58.75%-61.83%, population is less than 226,954,000, the US Dollar index is lower than 98.69, personal income is less than $2.192B, the average new house price is greater than $62,950, the month is later than July, and monthly new vehicle sales are less than $14.92M.</p>
<p><a href="http://bml.io/11hYExv"><img class="alignnone size-full wp-image-4399" alt="Captura de pantalla 2013-05-28 a la(s) 20.23.09" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-20-23-09.png?w=497&#038;h=406" width="497" height="406" /></a></p>
<p style="text-align:justify;">You can also analyze your model using BigML&#8217;s sunburst visualization&#8212;which has three viewing options:  Split Field, Prediction and Confidence. Shown below we see the same finding (from the decision tree above) in the Prediction view, where darker means a higher result.<br />
<a href="http://bml.io/11hYExv"><img class="alignnone size-full wp-image-4469" alt="Sunburst visualization" src="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-18-41-42.png?w=497&#038;h=446" width="497" height="446" /></a></p>
<p style="text-align:justify;">We&#8217;re just getting started in this collaboration with Quandl&#8212;stay tuned for more updates and innovations!  In the interim, if you build a model with a Quandl superset, please <a href="mailto:info@bigml.com" target="_blank">let us know</a>.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/4334/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/4334/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4334&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/05/31/quandl-bigml-powerful-financial-economic-and-social-predictive-models/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/fd79b3f2d489f784da5f4057e513621b?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">candidozuriaga</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-03-52-33.png" medium="image">
			<media:title type="html">www.quandl.com</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-19-59-06.png?w=113" medium="image">
			<media:title type="html">Total Population</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-20-00-58.png?w=103" medium="image">
			<media:title type="html">Vehicle sales</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-20-10-06.png?w=112" medium="image">
			<media:title type="html">Consumer Credit</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-19-59-54.png?w=101" medium="image">
			<media:title type="html">Houses sold</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-19-58-281.png" medium="image">
			<media:title type="html">Custom Superset</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-15-17-45.png" medium="image">
			<media:title type="html">BigML Upload Source</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-15-35-21.png" medium="image">
			<media:title type="html">Captura de pantalla 2013-05-30 a la(s) 15.35.21</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-15-20-13.png" medium="image">
			<media:title type="html">bigML Upload Remote Source</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-28-a-las-20-23-09.png" medium="image">
			<media:title type="html">Captura de pantalla 2013-05-28 a la(s) 20.23.09</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/captura-de-pantalla-2013-05-30-a-las-18-41-42.png" medium="image">
			<media:title type="html">Sunburst visualization</media:title>
		</media:content>
	</item>
		<item>
		<title>BigML—speaking near you!</title>
		<link>http://blog.bigml.com/2013/05/29/bigml-speaking-near-you/</link>
		<comments>http://blog.bigml.com/2013/05/29/bigml-speaking-near-you/#comments</comments>
		<pubDate>Wed, 29 May 2013 20:46:52 +0000</pubDate>
		<dc:creator>andrewshikiar</dc:creator>
				<category><![CDATA[News & Events]]></category>
		<category><![CDATA[Barcelona]]></category>
		<category><![CDATA[BigML]]></category>
		<category><![CDATA[Chicago]]></category>
		<category><![CDATA[London]]></category>
		<category><![CDATA[machine learning]]></category>
		<category><![CDATA[Meetup]]></category>
		<category><![CDATA[Predictive Analytics World]]></category>
		<category><![CDATA[San Francisco Bay Area]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=4405</guid>
		<description><![CDATA[Over the next few weeks BigML will be speaking and/or have a presence at a few events around the globe.  Please come to our speaking sessions and/or reach out to us if you’ll be in the area and want to meet in person! Barcelona On May 30, 3:00PM-5:00PM, BigML’s CTO and Hacker at Large jao will [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4405&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;">Over the next few weeks BigML will be speaking and/or have a presence at a few events around the globe.  Please come to our speaking sessions and/or <a href="http://bigml.com/contact">reach out</a> to us if you’ll be in the area and want to meet in person!</p>
<h2 style="text-align:justify;"><strong>Barcelona</strong></h2>
<p style="text-align:justify;">On May 30, 3:00PM-5:00PM, BigML’s CTO and Hacker at Large <a href="http://es.linkedin.com/in/jaortega" target="_blank">jao</a> will give a guest lecture on “<a href="http://www.jorditorres.org/teaching/research-master/clc-miri-2013-big-data-part/" target="_blank">Learning from Big Data Using the Cloud</a>” to students at the Universitat Politecnica de Catalunya (<a href="https://maps.google.com/maps?q=Carrer+Jordi+Girona,+1,+UPCnet,+Universitat+Polit%C3%A8cnica+de+Catalunya,+Campus+Nord,+Barcelona,+Spain&amp;hl=en&amp;ie=UTF8&amp;sll=37.0625,-95.677068&amp;sspn=49.176833,86.923828&amp;hnear=Carrer+Jordi+Girona,+1,+08034+Barcelona,+Spain&amp;t=m&amp;z=16">located in North Campus, Building 204</a>). This lecture is also free to attend.</p>
<p style="text-align:justify;"><a title="Learning from Big Data Using the Cloud" href="http://www.jorditorres.org/teaching/research-master/clc-miri-2013-big-data-part/" target="_blank"><img class="aligncenter size-large wp-image-4414" alt="jao" src="http://littleml.files.wordpress.com/2013/05/jao.jpg?w=497&#038;h=279" width="497" height="279" /></a></p>
<h2 style="text-align:justify;"><strong>San Francisco Bay Area</strong></h2>
<p style="text-align:justify;">On June 3, 6:30-8:30PM, BigML’s CEO <a href="http://www.linkedin.com/in/cisko">Dr. Francisco J Martin</a> will speak on “<strong>A Few Challenges to Make Machine Learning Easy</strong>” at a meeting hosted by the <a title="ACM San Francisco Bay Area Professional Chapter" href="http://www.sfbayacm.org/event/save-date-topic-tbd-6" target="_blank">ACM San Francisco Bay Area Chapter</a>.  The meeting will take place at the <a href="http://maps.google.com/maps?q=2065+Hamilton+Ave%2C+San+Jose%2C+CA">eBay Whitman Campus</a> and is <a title="A few Challenges to Make Machine Learning Easy" href="http://www.meetup.com/SF-Bay-ACM/events/119730122/" target="_blank">free to attend</a>.</p>
<p style="text-align:justify;"><a title="ACM San Francisco Bay Area Chapter" href="http://www.meetup.com/SF-Bay-ACM/events/119730122/" target="_blank"><img class="aligncenter size-large wp-image-4417" alt="fm2" src="http://littleml.files.wordpress.com/2013/05/fm2.jpg?w=497&#038;h=331" width="497" height="331" /></a></p>
<h2 style="text-align:justify;"><strong>London</strong></h2>
<p style="text-align:justify;">On June 4, BigML’s VP of Corporate Development <a href="http://www.linkedin.com/in/andrewshikiar/" target="_blank">Andrew Shikiar</a> will present as part of the <a href="http://agcpartners.com/events/agc-partners-first-european-growth-conference/" target="_blank">AGC Partners’ European Growth Conference</a>.  You can request an invitation from AGC Partners and/or contact us.</p>
<p style="text-align:justify;"><a title="AGC Partners’ European Growth Conference" href="http://agcpartners.com/events/agc-partners-first-european-growth-conference/" target="_blank"><img class="aligncenter size-large wp-image-4419" alt="agc_conference" src="http://littleml.files.wordpress.com/2013/05/agc_conference.png?w=497&#038;h=223" width="497" height="223" /></a></p>
<h2 style="text-align:justify;"><strong>Chicago</strong></h2>
<p style="text-align:justify;">BigML will have representatives attending <a href="http://www.predictiveanalyticsworld.com/chicago/2013/">Predictive Analytics World</a> on June 11-12. If you’re attending and want to meet up please let us know!</p>
<p style="text-align:justify;"><a title="Predictive Analytics World" href="http://www.predictiveanalyticsworld.com/chicago/2013/" target="_blank"><img class="aligncenter size-large wp-image-4422" alt="paworld" src="http://littleml.files.wordpress.com/2013/05/paworld.jpg?w=497&#038;h=175" width="497" height="175" /></a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/4405/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/4405/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4405&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/05/29/bigml-speaking-near-you/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/4c5f87730b1b1812b037dc32b12b0f3a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">ashikiar</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/jao.jpg?w=497" medium="image">
			<media:title type="html">jao</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/fm2.jpg?w=497" medium="image">
			<media:title type="html">fm2</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/agc_conference.png?w=497" medium="image">
			<media:title type="html">agc_conference</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/paworld.jpg?w=497" medium="image">
			<media:title type="html">paworld</media:title>
		</media:content>
	</item>
		<item>
		<title>Democratizing Machine Learning One Step at a Time</title>
		<link>http://blog.bigml.com/2013/05/28/democratizing-machine-learning-one-step-at-a-time/</link>
		<comments>http://blog.bigml.com/2013/05/28/democratizing-machine-learning-one-step-at-a-time/#comments</comments>
		<pubDate>Tue, 28 May 2013 07:11:55 +0000</pubDate>
		<dc:creator>franciscojmartin</dc:creator>
				<category><![CDATA[Business]]></category>
		<category><![CDATA[New Features]]></category>
		<category><![CDATA[Start up]]></category>
		<category><![CDATA[invite friends]]></category>
		<category><![CDATA[subscription plans]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=4313</guid>
		<description><![CDATA[At BigML we are working hard to make machine learning easy and accessible. We are happy to see a growing number of customers using BigML to solve very diverse problems from detecting fraud to computing risk to predicting consumer behavior and customer churn. We believe that one of the keys to fully democratize machine learning [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4313&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;">At BigML we are working hard to make <em><strong>machine learning easy</strong></em> and <strong><em>accessible</em></strong>. We are happy to see a growing number of customers using BigML to solve very diverse problems from detecting fraud to computing risk to predicting consumer behavior and customer churn.</p>
<p style="text-align:justify;">We believe that one of the keys to fully democratize machine learning is making it very <em><strong>affordable</strong></em>. Many current machine learning solutions are not only costly in terms of complexity, time-to-productivity, configuration, installation, and hardware but also in terms of the licensing costs themselves. We&#8217;ve priced BigML using an affordable <strong>pre-paid model</strong>: with <strong>no licensing fees</strong>, <strong>no monthly fees</strong>, and we&#8217;ve even thrown in a <strong>free development mode</strong>. You just pre-pay credits that you can consume as you go. However, we have seen that when users become more familiar with BigML they want to use it to solve many other data-driven tasks, keep their models up-to-date, and create ensembles that may have hundreds of models.</p>
<p style="text-align:justify;">For these users, we&#8217;ve been working on a <strong>new subscription model</strong> that will allow our customers to use BigML <strong>as much as they want</strong> for a monthly, quarterly, or yearly fee. The only limitations will be the <strong>maximum size of tasks</strong> and the <strong>number of parallel tasks</strong>. Customers will be able to select a specific plan or request a personalized one if their needs are different.</p>
<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/05/bigml_free_pro_plan.png"><img class="aligncenter size-large wp-image-4439" alt="bigml_free_pro_plan" src="http://littleml.files.wordpress.com/2013/05/bigml_free_pro_plan.png?w=497&#038;h=175" width="497" height="175" /></a></p>
<p style="text-align:justify;">BigML subscription plans will be ready very soon. Actually, they are ready but we are still deciding what the right max size, max parallel tasks, and price for each different plan might be. As such, we would like to get  a few users to try our subscription plans for free before setting up the initial parameters. We thought a great way to decide which users should get free subscriptions was by getting them to invite some other friends.</p>
<p style="text-align:justify;">If you <a title="Recommend BigML" href="https://bigml.com/account/recommend"><strong>invite three friends</strong></a> that sign up for free BigML accounts, you&#8217;ll get a free month subscription. With a free subscription you will be able to run unlimited tasks to create unlimited resources (datasets, models, ensembles, evaluations, and predictions). We&#8217;ll be limiting size and number of parallel tasks, but don&#8217;t worry: if you feel that our initial limitations are too strict for your current needs, we&#8217;ll be happy to upgrade them. You&#8217;ll be able to get up to a 3-month free subscription plan if you manage to get nine friends to sign up.</p>
<p><a href="http://littleml.files.wordpress.com/2013/05/bigml_invite_friends.png"><img class="aligncenter size-large wp-image-4322" alt="bigml_invite_friends" src="http://littleml.files.wordpress.com/2013/05/bigml_invite_friends.png?w=497&#038;h=318" width="497" height="318" /></a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/4313/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/4313/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4313&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/05/28/democratizing-machine-learning-one-step-at-a-time/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/e790f8968767fdb7e3bf32e76e7102f6?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">franciscojmartin</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/bigml_free_pro_plan.png?w=497" medium="image">
			<media:title type="html">bigml_free_pro_plan</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/bigml_invite_friends.png?w=497" medium="image">
			<media:title type="html">bigml_invite_friends</media:title>
		</media:content>
	</item>
		<item>
		<title>Three key steps on the Machine Learning stairway from the command line to the Cloud</title>
		<link>http://blog.bigml.com/2013/05/24/three-key-steps-on-the-machine-learning-stairway-from-the-command-line-to-the-cloud/</link>
		<comments>http://blog.bigml.com/2013/05/24/three-key-steps-on-the-machine-learning-stairway-from-the-command-line-to-the-cloud/#comments</comments>
		<pubDate>Fri, 24 May 2013 17:12:42 +0000</pubDate>
		<dc:creator>martinprats</dc:creator>
				<category><![CDATA[API]]></category>
		<category><![CDATA[BigML.io]]></category>
		<category><![CDATA[BigMLer]]></category>
		<category><![CDATA[Ensembles]]></category>
		<category><![CDATA[Machine Learning]]></category>
		<category><![CDATA[cross-validation]]></category>
		<category><![CDATA[evaluations]]></category>
		<category><![CDATA[splitting datasets]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=4159</guid>
		<description><![CDATA[Last time we talked about BigMLer, we saw that the list of BigML resources manageable from the command line included sources, datasets, models, predictions and evaluations. Since then, we&#8217;ve been working hard on BigMLer to bring even more cloud-based Machine Learning power to the comfort of your local computer. In this post we introduce the [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4159&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/05/steps.jpg"><img class="alignleft  wp-image-4208" style="margin:0 15px 10px 0;" title="Three key steps on the Machine Learning stairway from the command line to the Cloud" alt="steps" src="http://littleml.files.wordpress.com/2013/05/steps.jpg?w=245&#038;h=245" width="245" height="245" /></a>Last time we talked about <a title="BigMLer" href="https://github.com/bigmlcom/bigmler">BigMLer</a>, we saw that the list of BigML resources manageable from the command line included <a title="sources" href="https://bigml.com/developers/sources">sources</a>, <a title="datasets" href="https://bigml.com/developers/datasets">datasets</a>, <a title="models" href="https://bigml.com/developers/models">models</a>, <a title="predictions" href="https://bigml.com/developers/predictions">predictions</a> and <a title="evaluations" href="https://bigml.com/developers/evaluations">evaluations</a>. Since then, we&#8217;ve been working hard on BigMLer to bring even more cloud-based Machine Learning power to the comfort of your local computer.</p>
<p style="text-align:justify;">In this post we introduce the three latest additions to <strong>BigMLer:</strong> <strong>dataset splitting, <strong>cross-validation</strong></strong> and <strong>ensembles</strong>.</p>
<h1>Datasets split into datasets:</h1>
<p style="text-align:justify;">As we saw in previous BigMLer blog posts, the <code>--sample-rate</code> flag lets you control the fraction of data that will be used to build models and, consequently, what percentage is left for evaluations. However, to have better control of your test data you may want to <a title="permanently separate a group of instances from your dataset" href="http://blog.bigml.com/2013/05/23/get-your-training-and-test-sets-in-just-one-click/"><strong>permanently separate a group of instances from your dataset</strong></a>. OK, says BigMLer, let&#8217;s split your dataset:</p>
<pre class="brush: bash; collapse: false; highlight: [1]; light: true; title: ; wrap-lines: false; notranslate">
bigmler --train data/iris.csv --test-split 0.2 --evaluate
</pre>
<p style="text-align:justify;">in a single line you will</p>
<ul style="text-align:justify;">
<li>create a dataset with the entire data</li>
<li>split the dataset into a test one, holding 20% of the instances, and a training one with the remaining 80%</li>
<li>create a model with the training dataset</li>
<li>evaluate this model with the test dataset</li>
</ul>
<p style="text-align:justify;">Your training and test data will then be accessible as regular dataset objects that can be recovered independently or even split again. If you have a look at the command&#8217;s output, you&#8217;ll see <em>dataset_train </em>and <em>dataset_test</em> files, which contain the ids for the generated splits. Let&#8217;s say that your training dataset is <code>dataset/5188ddfa37203f085c000008</code> and your test dataset is<br />
<code>dataset/5188ddfd37203f085c00000b</code>. Then if you want to try the same data but in a model built with statistical pruning:</p>
<pre class="brush: bash; collapse: false; highlight: [1,2,3,4,5,6,7,8,9,10,11,12]; light: true; title: ; wrap-lines: false; notranslate">
bigmler --dataset dataset/5188ddfa37203f085c000008 \
        --pruning statistical
</pre>
<p>will build the new model <code>model/518954b437203f1a6f000000</code> and</p>
<pre class="brush: bash; collapse: false; highlight: [1,2]; light: true; title: ; wrap-lines: false; notranslate">
bigmler --dataset dataset/5188ddfd37203f085c00000b \
        --model model/518954b437203f1a6f000000 --evaluate
</pre>
<p style="text-align:justify;">will use the same test dataset for the evaluation. Then you&#8217;ll be able to compare your models and choose the one that performs best.</p>
<p style="text-align:justify;">While splitting a dataset once for evaluating a model is good, sometimes you want to create a handful of different splits to avoid basing your conclusions on a biased sample. Here again, <strong>BigMLer</strong> can help.</p>
<h1>Tuning your models: cross-validation</h1>
<p style="text-align:justify;">Usually, when you build a <strong>model</strong>, you can adjust some parameters to improve the model&#8217;s performance. For instance, BigML lets you choose between several pruning modes. Each choice will give you a different decision tree, but how do you know which one works better for your data? <a title="cross-validation" href="http://en.wikipedia.org/wiki/Cross-validation_(statistics)" target="_blank">Cross-validation</a> will help you with that. As you may know, cross-validation is a technique that helps you estimate the validity of your model using samples of your training data. Nowadays, several types of cross-validation are normally used, but all of them proceed by building many models with samples of your training data and testing them with the data held out during the sampling.</p>
<p style="text-align:justify;">In <strong>BigMLer </strong>we&#8217;ve chosen a Monte-Carlo variant of the cross-validation algorithm, which repeatedly splits the dataset at random into training and test subsets, builds the corresponding models and evaluates them, averaging the results. In this kind of algorithm, there&#8217;s no <em>a priori</em> relation between the size of the sample you use to evaluate and the number of evaluations you can run to validate (besides the fact that they should be enough to ensure good coverage of your data).  Never mind, you just have to say:</p>
<pre class="brush: bash; collapse: false; highlight: [1]; light: true; title: ; wrap-lines: false; notranslate">bigmler --train data/iris.csv --cross-validation-rate 0.1</pre>
<p style="text-align:justify;">and the tool will take care of the job, namely:</p>
<ul>
<li><span style="font-size:13px;line-height:19px;">create a dataset with all of the training data</span></li>
<li><span style="font-size:13px;line-height:19px;">hold out a random sample of 10% of the dataset to run evaluations</span></li>
<li><span style="font-size:13px;line-height:19px;">use the remaining 90% to build a partial model</span></li>
<li><span style="font-size:13px;line-height:19px;">evaluate the model with the held out data</span></li>
<li><span style="font-size:13px;line-height:19px;">repeat the previous steps using a different random sampling <em>2 * n</em> times, where <em>n</em> is the percentage of held out data (in this case, 20 runs), to reduce variance</span></li>
<li><span style="font-size:13px;line-height:19px;">finally, average all the partial models&#8217; evaluations to get a close estimation of the complete model&#8217;s performance</span></li>
</ul>
<p>Or, if you want to choose the number of evaluations, use the <code>--number-of-evaluations</code> flag:</p>
<pre class="brush: bash; collapse: false; highlight: [1,2]; light: true; title: ; wrap-lines: false; notranslate">
bigmler --train data/iris.csv --cross-validation-rate 0.1 \
        --number-of-evaluations 20</pre>
<p style="text-align:justify;"><strong>BigMLer</strong> will store the results in an output directory (see <a title="BigMLer: evaluations" href="http://blog.bigml.com/2013/01/31/fly-your-ml-cloud-like-a-kite-with-bigmler-the-command-line-tool-for-machine-learning/">BigMLer&#8217;s last blog post</a> for details) where you will find <em>cross_validation.json</em> and <em>cross_validation.txt</em> files, which will contain the average of all the models&#8217; <strong>evaluations</strong>.</p>
<p style="text-align:justify;">Of course,  sometimes a single model can fail to perform well for your data. Again, <strong>BigMLer</strong> has a solution for you.</p>
<h1>Models working together: ensembles</h1>
<p style="text-align:justify;"><a title="ensembles" href="https://bigml.com/developers/ensembles">Ensembles</a> are groups of models built by sampling a single dataset. Thus, the models in an ensemble are all different but similar enough, as they are built on a common part of information. As we saw in detail in <a title="The Three Cardinal Virtues of Ensemble Learning" href="http://blog.bigml.com/2013/04/04/the-three-cardinal-virtues-of-ensemble-learning/">previous posts</a>,<strong> ensembles&#8217; predictions are usually more accurate than those from a single model</strong> because the ensemble&#8217;s diversity helps smooth out small variations in the individual models, reinforcing their shared features. If you&#8217;ve read our <a title="BigMLer: ensembles" href="http://blog.bigml.com/2012/12/07/bigmler-in-da-cloud-machine-learning-made-even-easier/">first BigMLer post</a> you&#8217;ll be familiar with ensembles, as they were available in <strong>BigMLer</strong> long before they made their appearance in the <a title="BigML's python bindings" href="https://github.com/bigmlcom/python/">new version of BigML&#8217;s python bindings</a>. Nevertheless, now that <a title="ensembles are first class citizens" href="http://blog.bigml.com/2013/04/29/1-click-random-decision-forests/">ensembles are first class citizens </a> in the BigML API, <strong>BigMLer</strong> has adapted to handle them as one of your regular resources.</p>
<p style="text-align:justify;">For example, say you created <code>ensemble/51630c4e37203f2292000082</code> and want to use it to generate your predictions locally. Just let <strong>BigMLer</strong> do it:</p>
<pre class="brush: bash; collapse: false; highlight: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24]; light: true; title: ; wrap-lines: false; notranslate">
bigmler --ensemble ensemble/51630c4e37203f2292000082 \
        --test data/test_iris.csv
</pre>
<p style="text-align:justify;">Using this command, the ensemble information is downloaded and the model predictions are combined into a final prediction stored in predictions.csv.</p>
<p style="text-align:justify;">Similarly, evaluating the ensemble with test data amounts to:</p>
<pre class="brush: bash; collapse: false; highlight: [1,2]; light: true; title: ; wrap-lines: false; notranslate">
bigmler --ensemble ensemble/51630c4e37203f2292000082 \
        --test data/iris.csv --evaluate
</pre>
<p style="text-align:justify;">and you&#8217;ll find the usual <em>evaluation.json</em> and <em>evaluation.txt</em> in the output directory. How does that sound to you?</p>
<p style="text-align:justify;">To sum up, <strong>BigMLer</strong> now includes new features like<strong> dataset splitting</strong>, <strong>model&#8217;s cross-validation</strong> and <strong>ensembles&#8217; predictions and evaluations</strong> to help you get the best of your data. Want something else? <a title="bigml.com" href="mailto:feedback@bigml.com">Let us know</a> and stay tuned!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/4159/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/4159/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=4159&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/05/24/three-key-steps-on-the-machine-learning-stairway-from-the-command-line-to-the-cloud/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/e62dffcbe445e187f96f5e3ec06cce0e?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">martinprats</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/steps.jpg?w=300" medium="image">
			<media:title type="html">Three key steps on the Machine Learning stairway from the command line to the Cloud</media:title>
		</media:content>
	</item>
		<item>
		<title>Get your training and test sets in just one click</title>
		<link>http://blog.bigml.com/2013/05/23/get-your-training-and-test-sets-in-just-one-click/</link>
		<comments>http://blog.bigml.com/2013/05/23/get-your-training-and-test-sets-in-just-one-click/#comments</comments>
		<pubDate>Thu, 23 May 2013 21:12:14 +0000</pubDate>
		<dc:creator>franciscojmartin</dc:creator>
				<category><![CDATA[API]]></category>
		<category><![CDATA[Development]]></category>
		<category><![CDATA[New Features]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=3986</guid>
		<description><![CDATA[Testing or evaluating a predictive model involves using the model to generate predictions for a test set  and then computing a number of performance measures or metrics like accuracy, precision, recall, etc. These metrics estimate how well the model will perform when making predictions for instances that haven&#8217;t been used to train the model. You can read more about them [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=3986&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;">Testing or evaluating a predictive model involves using the model to generate predictions for a <a title="Test set" href="http://en.wikipedia.org/wiki/Test_set" target="_blank">test set</a>  and then computing a number of performance measures or metrics like <a title="Accuracy and Precision " href="http://en.wikipedia.org/wiki/Accuracy_and_precision" target="_blank">accuracy, precision, recall</a>, etc. These metrics estimate how well the model will perform when making predictions for instances that haven&#8217;t been used to train the model. You can read more about them in <a title="Predicting with My Model:  Is It Safe?" href="http://blog.bigml.com/2012/12/03/predicting-with-my-model-is-it-safe/">our post</a> on how to  perform automatic evaluations in BigML. But where to find a test set that is representative of the instances that the predictive model will face in production?</p>
<p style="text-align:justify;">A traditional approach is to split the available data into two disjoint sets, one for training and one for tests. If your programming skills are up to the task you could use, say, <a title="Training and Test sets in Weka" href="http://weka.wikispaces.com/How+do+I+divide+a+dataset+into+training+and+test+set%3F" target="_blank">Weka</a>, <a title="Training and test set split in Scikit" href="http://scikit-learn.org/dev/modules/generated/sklearn.cross_validation.train_test_split.html" target="_blank">Scikit</a>, <a title="Training and test set split in R" href="http://gettinggeneticsdone.blogspot.com/2011/02/split-data-frame-into-testing-and.html" target="_blank">R</a>, or <a title="Training and test sets in SQL Server" href="http://msdn.microsoft.com/en-us/library/bb895173.aspx">SQLS</a> to perform the split. But, as we explain in this post, BigML offers you a much easier way to automatically perform this task with just one click or, if you use our API, a couple of HTTP requests.</p>
<p><a href="http://littleml.files.wordpress.com/2013/05/training_test1.jpg"><img class="aligncenter" alt="1-click Train | Test dataset splits" src="http://littleml.files.wordpress.com/2013/05/training_test1.jpg?w=497&#038;h=328" width="497" height="328" /></a></p>
<p style="text-align:justify;">On the surface, splitting the instances of a dataset into training and test sets seems like an easy task, for example just take the first 80% as the training set and the rest as the test set. But what if the dataset is ordered according to non-random criteria? Let&#8217;s imagine that the instances are ordered by the field representing the class that you want to predict.  Then, almost certainly, the test set will not be a fair representation of the instances in the training set, or the new ones in production for that matter. For example, say that you have an equally balanced dataset with 5 classes, ordered by class.  Then, taking the first 80% of the dataset instances as your training set, your test set will contain instances of just one class and the training set only instances of the other four classes. So you&#8217;d be testing a model with instances of a class that it never saw during training and therefore the performance measures won&#8217;t be representative of the real performance of the model. This shows that random sampling is important when splitting your data into training and test sets.</p>
<p style="text-align:justify;">When it comes to random sampling, you could use the <a title="A Random Sampling Library for Clojure" href="http://blog.bigml.com/2013/01/22/a-random-sampling-library-for-clojure/">Clojure library that we open-sourced</a> a few months ago, but if you don&#8217;t know Clojure there&#8217;s no need to worry: in BigML&#8217;s interface splitting a dataset into a training and test set is now one click away.</p>
<p style="text-align:center;"><a href="http://littleml.files.wordpress.com/2013/05/1-click-tt2.png"><img class="aligncenter size-full wp-image-4298" alt="1-click-tt2" src="http://littleml.files.wordpress.com/2013/05/1-click-tt2.png?w=497"   /></a></p>
<p style="text-align:justify;">Now, in your dashboard, from the dataset listings or from an individual dataset view you have a new menu option to create a training and test set in only one click.  By default, the split for training and test set is 80/20. That is, 80% of the instances in the dataset will be used to create a new dataset suffixed with &#8220;training&#8221; and the other 20% of the instances will be used to create a new dataset suffixed with &#8220;test&#8221;.</p>
<p style="text-align:justify;">To get the same training/test set split from BigML&#8217;s API two requests are needed.  Both requests must use exactly the same values for the following arguments:</p>
<ul>
<li><strong>origin_dataset</strong>:  this is a newly created argument that specifies the dataset to split.</li>
<li><strong>seed</strong>: (e.g., “my seed”)  the seed to initialize the random number generator used to shuffle the rows.  It can be any string.  The same seed always gives rise to the same shuffling.</li>
<li><strong>sample_rate</strong>: a number between 0 and 1 that represents the fraction of instances (e.g., 0.7 for a 70/30 split).</li>
</ul>
<p style="text-align:justify;">The first request is to create the training set.  We have updated the dataset creation REST method to accept a new argument named <strong>origin_dataset</strong> that specifies the dataset to split. You also need to specify the <strong>sample_rate</strong> to use (e.g., 0.8) and a <strong>seed </strong>(e.g., &#8220;my seed&#8221;).  The seed is essential to guarantee that in the second call the random generator is initialized to exactly the same state, ensuring that the training and test sets are complementary.</p>
<pre class="brush: bash; collapse: false; highlight: [1,2,3,4,5,6,7]; light: true; title: ; wrap-lines: false; notranslate">
curl &quot;https://bigml.io/dataset?$BIGML_AUTH&quot; \
     -X POST \
     -H &quot;content-type: application/json&quot; \
     -d '{&quot;origin_dataset&quot;: &quot;dataset/518d0568925ded7ea40004a2&quot;,
          &quot;sample_rate&quot;: 0.8,
          &quot;seed&quot;: &quot;my seed&quot;,
          &quot;name&quot;: &quot;Training set&quot;}'
</pre>
<p style="text-align:justify;">The second request is to generate the test set. Notice that the <strong>sample_rate</strong> needs to be the same as in the training set creation (i.e., 0.8 and not  0.2 ). The key difference is the flag <strong>out_of_bag </strong>that needs to be set to <strong>true</strong>. This will create a new dataset with all the remaining (out-of-bag) instances that weren&#8217;t used in the training set creation.</p>
<pre class="brush: bash; collapse: false; highlight: [1,2,3,4,5,6,7,8]; light: true; title: ; wrap-lines: false; notranslate">
curl &quot;https://bigml.io/dataset?$BIGML_AUTH&quot; \
     -X POST \
     -H &quot;content-type: application/json&quot;  \
     -d '{&quot;origin_dataset&quot;: &quot;dataset/518d0568925ded7ea40004a2&quot;,
          &quot;sample_rate&quot;: 0.8,
          &quot;seed&quot;: &quot;my seed&quot;,
          &quot;out_of_bag&quot;: true,
          &quot;name&quot;: &quot;Test set&quot;}'
</pre>
<p style="text-align:justify;">Once you have your training and test sets ready, you can use the training set to create a model or ensemble and the test set to evaluate it. If you are satisfied with the results, then remember to use the original dataset (the one with all your data) to create the model or ensemble that you&#8217;ll release to production.  In our next post, we&#8217;ll show you how to split a dataset from BigML&#8217;s command line.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/3986/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/3986/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=3986&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/05/23/get-your-training-and-test-sets-in-just-one-click/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/e790f8968767fdb7e3bf32e76e7102f6?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">franciscojmartin</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/training_test1.jpg" medium="image">
			<media:title type="html">1-click Train &#124; Test dataset splits</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/1-click-tt2.png" medium="image">
			<media:title type="html">1-click-tt2</media:title>
		</media:content>
	</item>
		<item>
		<title>A SunBurst of Insight</title>
		<link>http://blog.bigml.com/2013/05/07/a-sunburst-of-insight/</link>
		<comments>http://blog.bigml.com/2013/05/07/a-sunburst-of-insight/#comments</comments>
		<pubDate>Tue, 07 May 2013 18:11:43 +0000</pubDate>
		<dc:creator>davidgerster</dc:creator>
				<category><![CDATA[Machine Learning]]></category>
		<category><![CDATA[Testimonials]]></category>
		<category><![CDATA[visualization]]></category>
		<category><![CDATA[sunburst]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=3938</guid>
		<description><![CDATA[This is a guest post by David Gerster (@gerster), a data scientist and investor in BigML. I work at a consumer web company, and recently used BigML to understand what drives return visits to our site. I followed Standard Operating Procedure for data mining, sampling a group of users, dividing them into two classes, and creating [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=3938&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;"><strong><em>This is a guest post by <a title="David Gerster " href="http://www.linkedin.com/in/gerster" target="_blank">David Gerster</a> (<a title="David Gerster " href="https://twitter.com/gerster" target="_blank">@gerster</a>), a data scientist and investor in BigML.</em></strong></p>
<p style="text-align:justify;">I work at a consumer web company, and recently used BigML to understand what drives return visits to our site. I followed Standard Operating Procedure for data mining, sampling a group of users, dividing them into two classes, and creating several features that I hoped would be useful in predicting these classes. I then fed this training data to BigML, which quickly and obediently produced a decision tree:</p>
<p><a href="http://littleml.files.wordpress.com/2013/04/decision_tree.png"><img class="aligncenter size-large wp-image-3926" alt="decision_tree" src="http://littleml.files.wordpress.com/2013/04/decision_tree.png?w=497&#038;h=347" width="497" height="347" /></a></p>
<p style="text-align:justify;">Next I used BigML&#8217;s interface to examine the tree&#8217;s many subsets, shown as &#8220;nodes&#8221; in the diagram above. I moused over a node at the top of the tree and saw that it achieved high separation for a large fraction of the training set:</p>
<p style="text-align:center;"><a href="http://littleml.files.wordpress.com/2013/04/screenshot019.gif"><img class=" wp-image-3963 aligncenter" alt="Shhh, I'm hunting nodes!" src="http://littleml.files.wordpress.com/2013/04/screenshot019.gif?w=440&#038;h=288" width="440" height="288" /></a></p>
<p style="text-align:justify;">This one node covered 58% of the data, and separated the two classes with 73% confidence. (&#8220;Confidence&#8221; is a measure of node purity, and for this node 73% of the data belongs to class &#8220;0&#8221;.) With a little more work, I found another node that covered another 22% of the data, this time predicting class &#8220;1&#8221;. For the remaining 20% of data, the best rule I could find (after much mousing-over in the tree) was a single node with a lousy 51% confidence—barely an improvement over flipping a coin. I affectionately named these nodes Rule 1, Rule 2 and Blind Spot.</p>
<p style="text-align:justify;">This is a common use for decision trees: gaining insight by finding the &#8220;best&#8221; nodes as measured by the fraction of data they cover (&#8220;support&#8221;) and their purity (&#8220;confidence&#8221;). When exploring a decision tree for insight, the goal is to find the smallest collection of useful rules that accurately summarizes your data.</p>
<p style="text-align:justify;">Today BigML is launching the SunBurst visualization, which makes it much easier to gain insight from decision trees. Below is a SunBurst viz of my tree: the nodes are now shown as arcs, with the number of radians representing support and the color representing confidence. With a minimum of manual searching, I can easily find Rule 1, Rule 2 and especially the Blind Spot (which, together with its subsets, stands out in ugly, non-predictive brown):</p>
<p style="text-align:center;"><a href="http://littleml.files.wordpress.com/2013/04/screenshot022.gif"><img class=" wp-image-3964 aligncenter" alt="A Burst of Insight!" src="http://littleml.files.wordpress.com/2013/04/screenshot022.gif?w=500" width="500" /></a></p>
<p style="text-align:justify;">Let us ponder the amazing feat this Burst of Sun has achieved. In an eight-dimensional data set of 48,000 instances, I can see immediately which nodes have the highest combination of support and confidence. But wait, there&#8217;s more: I can also see exactly how all of the nodes fit together in a tree hierarchy, which gives me further insight into the data. For example, the upper right of the tree shows several large subsets of Rule 1 that glow bright green, and a closer look reveals that these subsets stack like Russian nesting dolls, each one prettier (but smaller) than its parent. So if I felt that Rule 1 misclassified too many instances, I could easily select one of its prettier children instead, choosing higher confidence at the cost of lower support.</p>
<p style="text-align:justify;">Try out the SunBurst confidence visualization for yourself by training a model, going to the Models tab, and clicking the hypnotic SunBurst icon:</p>
<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/05/screenshot034.gif"><img class="alignnone size-full wp-image-4095" alt="ScreenShot034" src="http://littleml.files.wordpress.com/2013/05/screenshot034.gif?w=497"   /></a></p>
<p style="text-align:justify;">Then just click the Confidence icon. May all your nodes be green!</p>
<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/05/screenshot035.gif"><img class="alignnone size-full wp-image-4096" alt="ScreenShot035" src="http://littleml.files.wordpress.com/2013/05/screenshot035.gif?w=497"   /></a></p>
<p style="text-align:justify;">Better visualization obviously does not solve everything, and the usual cautions about understanding your data and validating your model still apply. While the above model is useful (since I haven&#8217;t said anything about the actual data it&#8217;s trained on, you&#8217;ll have to take my word for it), I still want to try a larger data set, and validate the resulting model by splitting the data into training and test. Perhaps most importantly, we cannot assume that two subsets with similar confidence but different predicted classes (like Rule 1 and Rule 2) will make equally valid rules in practice, since a false positive could be much, much worse than a false negative, or vice versa.</p>
<p style="text-align:justify;">Nonetheless, the SunBurst is a huge leap forward in decision tree visualization. BigML&#8217;s Adam Ashenfelter <a title="A New Way to Visualize Decision Trees" href="http://blog.bigml.com/2013/04/19/a-new-way-to-visualize-decision-trees/" target="_blank">explains</a> that the SunBurst &#8220;may not be as intuitive as our regular tree view&#8221;, and he&#8217;s right—it&#8217;s about a thousand times more intuitive than the regular tree view. You can have my SunBurst when you pry it from my cold, dead retinas.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/3938/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/3938/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=3938&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/05/07/a-sunburst-of-insight/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/77189abb429873e2386f9e37ef17c860?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">davidgerster</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/04/decision_tree.png?w=497" medium="image">
			<media:title type="html">decision_tree</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/04/screenshot019.gif" medium="image">
			<media:title type="html">Shhh, I&#039;m hunting nodes!</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/04/screenshot022.gif" medium="image">
			<media:title type="html">A Burst of Insight!</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/screenshot034.gif" medium="image">
			<media:title type="html">ScreenShot034</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/screenshot035.gif" medium="image">
			<media:title type="html">ScreenShot035</media:title>
		</media:content>
	</item>
		<item>
		<title>Alternative Keys: Fine-grained, REST API Access to Your Machine Learning Resources</title>
		<link>http://blog.bigml.com/2013/05/03/alternative-keys-fine-grained-rest-api-access-to-your-machine-learning-resources/</link>
		<comments>http://blog.bigml.com/2013/05/03/alternative-keys-fine-grained-rest-api-access-to-your-machine-learning-resources/#comments</comments>
		<pubDate>Fri, 03 May 2013 18:58:43 +0000</pubDate>
		<dc:creator>osroca</dc:creator>
				<category><![CDATA[API]]></category>
		<category><![CDATA[BigML.io]]></category>
		<category><![CDATA[New Features]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=3981</guid>
		<description><![CDATA[Accessing BigML via our REST API is easy, requiring only a username and an API Key. Every account registered with BigML automatically gets a master API Key which has full access to all capabilities within your account.  That is, with the master key you can programmatically create, retrieve, update or delete sources, datasets, models, ensembles, predictions, and [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=3981&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;">Accessing BigML via our REST API is easy, requiring only a username and an API Key. Every account registered with BigML automatically gets a master API Key which has full access to all capabilities within your account.  That is, with the master key you can programmatically create, retrieve, update or delete <a title="BigML Sources" href="https://bigml.com/developers/sources">sources</a>, <a title="BigML datasets" href="https://bigml.com/developers/datasets">datasets</a>, <a title="BigML models" href="https://bigml.com/developers/models">models</a>, <a title="BigML ensembles" href="https://bigml.com/developers/ensembles">ensembles</a>, <a title="BigML predictions" href="https://bigml.com/developers/predictions">predictions</a>, and <a title="BigML evaluations" href="https://bigml.com/developers/evaluations">evaluations</a>, all via the <a title="Fly your ML-Cloud like a kite with BigMLer: the command-line tool for Machine Learning" href="http://blog.bigml.com/2013/01/31/fly-your-ml-cloud-like-a-kite-with-bigmler-the-command-line-tool-for-machine-learning/">command line</a>, any of the <a title="Machine Learning in Python Has Never Been Easier!" href="http://blog.bigml.com/2012/05/04/machine-learning-in-python-has-never-been-easier/">API bindings</a> that <a title="(+ clojure big-ml)" href="http://blog.bigml.com/2013/01/17/clojure-big-ml/">we</a> or <a title="Machine Learning in Java has never been easier!" href="http://blog.bigml.com/2012/06/07/machine-learning-in-java-has-never-been-easier/">our</a> <a title="Democratizing Machine Learning With C#" href="http://blog.bigml.com/2013/03/06/democratizing-machine-learning-with-c/">fans</a> <a title="Machine Learning on Rails with Ruby!" 
href="http://blog.bigml.com/2012/07/06/machine-learning-on-rails-with-ruby/">have</a> <a title="Your Smartphone Just Got Smarter" href="http://blog.bigml.com/2012/06/05/your-smartphone-just-got-smarter/">been</a> <a title="R you ready for Big Machine Learning?" href="http://blog.bigml.com/2012/05/10/r-you-ready-for-bigml/">developing</a>, or your own private implementation.</p>
<p style="text-align:justify;">We even make finding and using the API Key easy. BigML&#8217;s web interface provides an icon for each resource that lets you get its URL with the api key already encoded, allowing you to access the resource directly from within your application.</p>
<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/05/apikeyandresourceid.png"><img alt="APIKeyandresourceid" src="http://littleml.files.wordpress.com/2013/05/apikeyandresourceid.png?w=497&#038;h=311" width="497" height="311" /></a></p>
<p style="text-align:justify;">However, although the power of your master API Key makes working with BigML&#8217;s API easy, it also comes with potential risk. There is no way to share access to your resources in a limited way, and if you do share your master API Key, then you are granting access to every capability in your account. The only method to mitigate this risk previously was the ability to recreate your master key on demand:</p>
<p><a href="http://littleml.files.wordpress.com/2013/05/recreateapikey.png"><img class="aligncenter size-large wp-image-4006" alt="recreateapikey" src="http://littleml.files.wordpress.com/2013/05/recreateapikey.png?w=497&#038;h=106" width="497" height="106" /></a></p>
<p style="text-align:justify;">In order to address this limitation, our <a title="1-click Random Decision Forests" href="http://blog.bigml.com/2013/04/29/1-click-random-decision-forests/">latest release</a> brings the ability to add <strong>Alternative API Keys</strong> to your account with finer grained controls.  You can define what resources a key can access and what operations (i.e., <strong>create</strong>, <strong>list</strong>, <strong>retrieve</strong>, <strong>update</strong> or <strong>delete</strong>) are allowed with it. This is useful in scenarios where you want to grant different roles and privileges to different applications. For example, an application for the IT folks that collects data and <strong>creates</strong> sources in BigML, another that is accessed by data scientists to <strong>create</strong> and <strong>evaluate</strong> models, and a third that is used by the marketing folks to <strong>create</strong> predictions.</p>
<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/05/createnewapikey.png"><img class="aligncenter size-large wp-image-3999" alt="CreateNewAPIKey" src="http://littleml.files.wordpress.com/2013/05/createnewapikey.png?w=497&#038;h=534" width="497" height="534" /></a></p>
<p style="text-align:justify;">We have implemented some logic behind the scenes to ensure that the permissions you assign are sound. For example, if you want a key to be able to <strong>create</strong> models, it must also be able to <strong>read</strong> datasets and models; similarly, if you want your API key to be able to <strong>create</strong> evaluations it must be able to <strong>read</strong> datasets, models, and also evaluations.</p>
<p><a href="http://littleml.files.wordpress.com/2013/05/alternativekeys.png"><img class="aligncenter size-large wp-image-4000" alt="AlternativeKeys" src="http://littleml.files.wordpress.com/2013/05/alternativekeys.png?w=497&#038;h=314" width="497" height="314" /></a></p>
<p style="text-align:justify;">If you give <strong>Alternative API Keys</strong> a try please let us know what you think, especially if there is anything we could improve to make it more useful. We appreciate your feedback and are available to help!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/3981/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/3981/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=3981&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/05/03/alternative-keys-fine-grained-rest-api-access-to-your-machine-learning-resources/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/b6d6e5fb79875f971349005846d78f95?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">osroca</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/apikeyandresourceid.png?w=497" medium="image">
			<media:title type="html">APIKeyandresourceid</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/recreateapikey.png?w=497" medium="image">
			<media:title type="html">recreateapikey</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/createnewapikey.png?w=497" medium="image">
			<media:title type="html">CreateNewAPIKey</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/05/alternativekeys.png?w=497" medium="image">
			<media:title type="html">AlternativeKeys</media:title>
		</media:content>
	</item>
		<item>
		<title>1-click Random Decision Forests</title>
		<link>http://blog.bigml.com/2013/04/29/1-click-random-decision-forests/</link>
		<comments>http://blog.bigml.com/2013/04/29/1-click-random-decision-forests/#comments</comments>
		<pubDate>Mon, 29 Apr 2013 21:34:17 +0000</pubDate>
		<dc:creator>petersp</dc:creator>
				<category><![CDATA[Ensembles]]></category>
		<category><![CDATA[New Features]]></category>
		<category><![CDATA[Bagging]]></category>
		<category><![CDATA[Random Decision Forest]]></category>

		<guid isPermaLink="false">http://blog.bigml.com/?p=3876</guid>
		<description><![CDATA[One of the pitfalls of machine learning is that creating a single predictive model has the potential to overfit your data. That is, the performance on your training data might be very good, but the model does not generalize well to new data. Ensemble learning of decision trees, also referred to as forests or simply ensembles,  is a [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=3876&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p style="text-align:justify;">One of the pitfalls of machine learning is that creating a single predictive model has the potential to overfit your data. That is, the performance on your training data might be very good, but the model does not generalize well to new data. Ensemble learning of decision trees, also referred to as forests or simply <strong>ensembles</strong>,  is a tried-and-true technique for reducing the error of single machine-learned models. By learning <a title="The Rewards of Ignoring Data" href="http://blog.bigml.com/2012/12/14/the-rewards-of-ignoring-data/">multiple models over different subsamples of your data</a> and taking a majority vote at prediction time, the risk of overfitting a single model to all of the data is mitigated. You can read more about this in our <a title="The Three Cardinal Virtues of Ensemble Learning" href="http://blog.bigml.com/2013/04/04/the-three-cardinal-virtues-of-ensemble-learning/">previous post</a>.</p>
<p style="text-align:justify;">Early this year, we showed how BigML <strong>ensembles</strong> outperform their solo counterparts and even <a title="Machine Learning Throwdown:  The Reckoning" href="http://blog.bigml.com/2013/01/04/machine-learning-throwdown-the-reckoning/">beat other machine learning services</a>. However, up until now creating <strong>ensembles</strong> with BigML has only been <a title="The Rewards of Ignoring Data" href="http://blog.bigml.com/2012/12/14/the-rewards-of-ignoring-data/">available via our API</a>. We are excited to announce that <strong>ensembles</strong> are now available via our web interface and that they have also become <a title="BigML ensembles" href="https://bigml.com/developers/ensembles">first-class citizens in our API</a>.</p>
<h1>Creating Ensembles</h1>
<p style="text-align:justify;">You can create an <strong>ensemble</strong> just as you would create a <strong>model</strong>, with the addition of three optional parameters:</p>
<ol>
<li style="text-align:justify;"><span style="line-height:13px;">Whether you want <strong>fields</strong> to be selected randomly at each split (i.e., <strong>decision forest</strong>) or only <strong>bagging</strong> to be used. </span></li>
<li>The <strong>number of models</strong>.</li>
<li>The <strong>task level parallelism</strong>.</li>
</ol>
<h3><a href="http://littleml.files.wordpress.com/2013/04/create_ensemble.png"><img class="aligncenter size-large wp-image-3879" alt="create_ensemble" src="http://littleml.files.wordpress.com/2013/04/create_ensemble.png?w=497&#038;h=186" width="497" height="186" /></a></h3>
<h2>Decision Forest</h2>
<p style="text-align:justify;">A Decision forest or <a title="Random decision forest" href="http://en.wikipedia.org/wiki/Random_forest" target="_blank">random decision forest</a> is created by selecting a random set of the input fields at each split node while an individual model in the <strong>ensemble</strong> is being built instead of considering all the input fields. This is the strategy that BigML uses by default. If you just want to use bagging you should deselect this option.</p>
<h2>Bagging</h2>
<p style="text-align:justify;"><a title="Bagging" href="http://en.wikipedia.org/wiki/Bootstrap_aggregating" target="_blank">Bagging</a>, also known as bootstrap aggregating, is one of the simplest <strong>ensemble</strong>-based strategies but often outperforms strategies that are more complex. This method uses a different random subset of the original dataset for each model in the ensemble. By default, BigML uses a sampling rate of 100% with replacement for each model, meaning that individual instances can be selected more than once from the dataset. You can select different sampling rates using the sampling configuration panels.</p>
<h2>Number of Models</h2>
<p style="text-align:justify;">The default is ten, but depending on your data and other modeling parameters you might want to use a bigger number. Generally, increasing the number of models in an <strong>ensemble</strong> lowers the effect of noise and model variability, and has no downside except the additional cost to you, the user. The cases where more models are likely to be beneficial are when the data is not terribly large (in the thousands of instances or less), when the data is very noisy, and for random decision forests, when there are many correlated features that are all at least somewhat useful.</p>
<p style="text-align:justify;">Keep in mind that each additional model tends to deliver decreasing marginal improvement, so if the difference between nine and ten models is very small, it is very unlikely that an eleventh model will make a big difference.</p>
<h2>Task Level Parallelism</h2>
<p style="text-align:justify;">The <em><strong>task level parallelism</strong></em> is the level of parallelism that BigML will use to perform a task that is decomposable into embarrassingly parallel tasks like building the models of a random decision forest. We offer five different levels. In the lowest, sub-tasks will be performed sequentially and in the highest level up to 16 sub-tasks will be performed in parallel. The higher the level, the faster your <strong>ensemble</strong> will be finished, but the more credits it will cost you.</p>
<h1 style="text-align:center;"><a href="http://littleml.files.wordpress.com/2013/04/ensemble1.png"><img alt="ensemble" src="http://littleml.files.wordpress.com/2013/04/ensemble1.png?w=497&#038;h=406" width="497" height="406" /></a></h1>
<h2>1-click Ensemble</h2>
<p style="text-align:justify;">You can also create an ensemble in just one click.  By default, a 1-click ensemble will create a <strong>random decision forest</strong> of <strong>10 models</strong> using <strong>100%</strong> of the original dataset but <strong>sampling it with replacement</strong>.</p>
<h1><a href="http://littleml.files.wordpress.com/2013/04/1-click-ensemble.png"><img class="aligncenter size-full wp-image-3913" alt="1-click-ensemble" src="http://littleml.files.wordpress.com/2013/04/1-click-ensemble.png?w=497"   /></a></h1>
<h1>Predicting with Ensembles</h1>
<p style="text-align:justify;">Once your <strong>ensemble</strong> is finished, creating a <strong>prediction</strong> is the same as creating a <strong>prediction</strong> with a single <strong>model</strong>, with one additional step: the predictions from the individual models of the ensemble must be combined into a final prediction. The default method for combining the predictions is plurality vote for a classification <strong>ensemble</strong> and a simple average for a regression <strong>ensemble</strong>.</p>
<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/04/ensemble_prediction1.png"><img class="aligncenter size-large wp-image-3896" alt="ensemble_prediction" src="http://littleml.files.wordpress.com/2013/04/ensemble_prediction1.png?w=497&#038;h=370" width="497" height="370" /></a></p>
<p style="text-align:justify;">BigML offers three different methods for combining the predictions of an <strong>ensemble</strong> :</p>
<ol>
<li style="text-align:justify;"><strong><strong><span style="line-height:13px;">Plurality - </span></strong></strong>weighs each model’s prediction as one vote for classification <strong>ensembles</strong>. For regression <strong>ensembles</strong>, the predictions are averaged.</li>
<li style="text-align:justify;"><strong>Confidence Weighted</strong> - uses each prediction&#8217;s confidence as a voting weight for classification <strong>ensembles</strong>. For regression <strong>ensembles</strong>, computes a weighted average using the associated error as the weight.</li>
<li style="text-align:justify;"><strong>Probability Weighted</strong> - uses the probability of the class in the distribution of classes in the leaf node of each prediction as a voting weight for classification <strong>ensembles</strong>. For regression <strong>ensembles</strong>, this method is equivalent to the <strong>plurality </strong>method above.</li>
</ol>
<p style="text-align:justify;"><span style="line-height:13px;"><span style="line-height:19px;">Predictions take longer in <strong>ensembles</strong> than in single models, but you can also download <strong>ensembles</strong> using our <em>download actionable ensemble</em> button to perform low latency predictions directly in your applications. So far they are only available in <a title="Python bindings" href="https://github.com/bigmlcom/python">Python</a> but we&#8217;ll bring them to more programming languages soon. We also plan to bring high-performance predictions in an upcoming release, so stay tuned.</span></span></p>
<h1>Evaluating Ensembles</h1>
<p style="text-align:justify;">You can also evaluate an <strong>ensemble</strong> in the same way as a single <strong>model</strong>.</p>
<p style="text-align:justify;"><a href="http://littleml.files.wordpress.com/2013/04/ensemble_evaluation1.png"><img class="aligncenter size-large wp-image-3894" alt="ensemble_evaluation" src="http://littleml.files.wordpress.com/2013/04/ensemble_evaluation1.png?w=497&#038;h=392" width="497" height="392" /></a></p>
<p style="text-align:justify;">The level of accuracy achieved by ensembles of decision trees on previously unseen data very often exceeds that of most other techniques, even more sophisticated or complex ones.  Not surprisingly, then, it is common to find random decision forests among the top performers in <a href="http://www.slate.com/articles/health_and_science/new_scientist/2012/12/kaggle_president_jeremy_howard_amateurs_beat_specialists_in_data_prediction.html" target="_blank">Kaggle&#8217;s competitions</a>.  Finally, <strong>ensembles</strong> of decision trees can be applied to perform a multitude of tasks such as classification, regression, manifold learning, density estimation, and semi-supervised classification in thousands of real-world domains. If you&#8217;re interested in a great monograph about random decision forests, we recommend <a title="Decision Forests (Criminisi and Shotton)" href="http://www.amazon.com/Decision-Computer-Analysis-Advances-Recognition/dp/1447149289" target="_blank">this book</a>.</p>
<p style="text-align:justify;">We hope that you give BigML <strong>ensembles</strong> a try and let us know about your experience and results. Moreover, in this new release there are a number of small goodies like <em><strong>1-click training|test split</strong></em>, <em><strong>alternative API keys</strong></em> to access your BigML resources with different privileges, <em><strong>comparing evaluations</strong></em>, and many other things under the hood to make everything come together. We&#8217;ll explain it all in future blog posts!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/littleml.wordpress.com/3876/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/littleml.wordpress.com/3876/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.bigml.com&#038;blog=30283844&#038;post=3876&#038;subd=littleml&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.bigml.com/2013/04/29/1-click-random-decision-forests/feed/</wfw:commentRss>
		<slash:comments>5</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/d022e5d3c9b475f44f1cb45ed48e5975?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">petersp</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/04/create_ensemble.png?w=497" medium="image">
			<media:title type="html">create_ensemble</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/04/ensemble1.png?w=497" medium="image">
			<media:title type="html">ensemble</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/04/1-click-ensemble.png" medium="image">
			<media:title type="html">1-click-ensemble</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/04/ensemble_prediction1.png?w=497" medium="image">
			<media:title type="html">ensemble_prediction</media:title>
		</media:content>

		<media:content url="http://littleml.files.wordpress.com/2013/04/ensemble_evaluation1.png?w=497" medium="image">
			<media:title type="html">ensemble_evaluation</media:title>
		</media:content>
	</item>
	</channel>
</rss>
