<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>cashplk的心路历程 &#187; 去重</title>
	<atom:link href="http://cashplk.com/tag/%e5%8e%bb%e9%87%8d/feed/" rel="self" type="application/rss+xml" />
	<link>http://cashplk.com</link>
	<description>学无止境，梦如夏花～</description>
	<lastBuildDate>Sat, 31 Jul 2010 10:15:57 +0000</lastBuildDate>
	<language>zh-CN</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.0.1</generator>
		<item>
		<title>使用Ruby分析文件＆去重</title>
		<link>http://cashplk.com/2009/06/24/%e4%bd%bf%e7%94%a8ruby%e5%88%86%e6%9e%90%e6%96%87%e4%bb%b6%ef%bc%86%e5%8e%bb%e9%87%8d/</link>
		<comments>http://cashplk.com/2009/06/24/%e4%bd%bf%e7%94%a8ruby%e5%88%86%e6%9e%90%e6%96%87%e4%bb%b6%ef%bc%86%e5%8e%bb%e9%87%8d/#comments</comments>
		<pubDate>Wed, 24 Jun 2009 03:14:16 +0000</pubDate>
		<dc:creator>cashplk</dc:creator>
				<category><![CDATA[Ruby]]></category>
		<category><![CDATA[去重]]></category>
		<category><![CDATA[分析]]></category>
		<category><![CDATA[uniq]]></category>

		<guid isPermaLink="false">http://cashplk.com/?p=62</guid>
		<description><![CDATA[需要分析一个18M的文件，其中内容类似如下： 2009-06-20 00:00:07,678 [5409241381(T,105101,**,N,**,18ms)(12345678901234567890,dummy,**,888888,-,1000.00,**,-,005004-**,5409241381,0,null)] 文件以=== All done!!! ===结束。 全部日志加起来有7W多条，由于是时间程序批量请求的，所以其中肯定有很多重复的数据。我需要的只是那个12345678901234567890的20位数字而已。 so,尝试一下Ruby的脚本能力。 file = File.open&#40;result, 'r'&#41; # 读取文件 &#160; # 使用正则表达式匹配20个数字 regex = /\d&#123;20&#125;/ &#160; $array = Array.new&#40;&#41;; &#160; while&#40;line = file.gets&#41; # 读取文件 $array.push regex.match&#40;line&#41;.to_s break if line == '=== All done!!! ===' end file.close &#160; # 去重 $resultFile.puts&#40;$array.uniq&#41; puts 'convert ends!!']]></description>
			<content:encoded><![CDATA[<p>需要分析一个18M的文件，其中内容类似如下：</p>
<p>2009-06-20 00:00:07,678 [5409241381(T,105101,**,N,**,18ms)(12345678901234567890,dummy,**,888888,-,1000.00,**,-,005004-**,5409241381,0,null)]</p>
<p>文件以=== All done!!! ===结束。</p>
<p>全部日志加起来有7W多条，由于是时间程序批量请求的，所以其中肯定有很多重复的数据。我需要的只是那个12345678901234567890的20位数字而已。<br />
so,尝试一下Ruby的脚本能力。</p>

<div class="wp_syntax"><div class="code"><pre class="ruby" style="font-family:monospace;">  file = <span style="color:#CC00FF; font-weight:bold;">File</span>.<span style="color:#CC0066; font-weight:bold;">open</span><span style="color:#006600; font-weight:bold;">&#40;</span>result, <span style="color:#996600;">'r'</span><span style="color:#006600; font-weight:bold;">&#41;</span> <span style="color:#008000; font-style:italic;"># 读取文件</span>
&nbsp;
<span style="color:#008000; font-style:italic;"># 使用正则表达式匹配20个数字</span>
regex = <span style="color:#006600; font-weight:bold;">/</span>\d<span style="color:#006600; font-weight:bold;">&#123;</span><span style="color:#006666;">20</span><span style="color:#006600; font-weight:bold;">&#125;</span><span style="color:#006600; font-weight:bold;">/</span>
&nbsp;
<span style="color:#ff6633; font-weight:bold;">$array</span> = <span style="color:#CC0066; font-weight:bold;">Array</span>.<span style="color:#9900CC;">new</span><span style="color:#006600; font-weight:bold;">&#40;</span><span style="color:#006600; font-weight:bold;">&#41;</span>;
&nbsp;
  <span style="color:#9966CC; font-weight:bold;">while</span><span style="color:#006600; font-weight:bold;">&#40;</span>line = file.<span style="color:#CC0066; font-weight:bold;">gets</span><span style="color:#006600; font-weight:bold;">&#41;</span>  <span style="color:#008000; font-style:italic;"># 读取文件</span>
    <span style="color:#ff6633; font-weight:bold;">$array</span>.<span style="color:#9900CC;">push</span> regex.<span style="color:#9900CC;">match</span><span style="color:#006600; font-weight:bold;">&#40;</span>line<span style="color:#006600; font-weight:bold;">&#41;</span>.<span style="color:#9900CC;">to_s</span>
    <span style="color:#9966CC; font-weight:bold;">break</span> <span style="color:#9966CC; font-weight:bold;">if</span> line == <span style="color:#996600;">'=== All done!!! ==='</span>
  <span style="color:#9966CC; font-weight:bold;">end</span>
  file.<span style="color:#9900CC;">close</span>
&nbsp;
<span style="color:#008000; font-style:italic;"># 去重</span>
<span style="color:#ff6633; font-weight:bold;">$resultFile</span>.<span style="color:#CC0066; font-weight:bold;">puts</span><span style="color:#006600; font-weight:bold;">&#40;</span>$array.<span style="color:#9900CC;">uniq</span><span style="color:#006600; font-weight:bold;">&#41;</span>
<span style="color:#CC0066; font-weight:bold;">puts</span> <span style="color:#996600;">'convert ends!!'</span></pre></div></div>

]]></content:encoded>
			<wfw:commentRss>http://cashplk.com/2009/06/24/%e4%bd%bf%e7%94%a8ruby%e5%88%86%e6%9e%90%e6%96%87%e4%bb%b6%ef%bc%86%e5%8e%bb%e9%87%8d/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>
<!-- WP Super Cache is installed but broken. The path to wp-cache-phase1.php in wp-content/advanced-cache.php must be fixed! -->