gem install -f scrubyt
--- inline.rb.orig 2008-06-21 00:39:00 +0900 +++ inline.rb 2008-06-21 00:39:00 +0900 @@ -253,7 +253,8 @@ 0 end - file, line = caller[1].split(/:/) +# file, line = caller[1].split(/:/) + file, line = caller[1].scan(/(.+):(\d+)(?=$|:)/).first result = "# line #{line.to_i + delta} \"#{file}\"\n" + result unless $DEBUG and not $TESTING @src << result @@ -295,7 +296,8 @@ raise "Couldn't discover caller" if stack.empty? real_caller = stack.first real_caller = stack[3] if real_caller =~ /\(eval\)/ - real_caller = real_caller.split(/:/, 3)[0..1] +# real_caller = real_caller.split(/:/, 3)[0..1] + real_caller = real_caller.scan(/(.+):(\d+)(?=$|:)/).first @real_caller = real_caller.join ':' @rb_file = File.expand_path real_caller.first @@ -429,7 +431,7 @@ '-I', hdrdir, config_hdrdir, '-I', Config::CONFIG['includedir'], - "-L#{Config::CONFIG['libdir']}", +# "-L#{Config::CONFIG['libdir']}", '-o', so_name.inspect, File.expand_path(src_name).inspect, libs,
s.add_dependency(%q<RubyInline>, [">= 3.6.3"])
--- parse_tree.rb.orig 2008-06-15 11:19:27 +0900 +++ parse_tree.rb 2008-11-08 15:25:27 +0900 @@ -288,7 +288,7 @@ # 1) Get me a login on your box so I can repro this and get it fixed. # 2) Fix it and send me the patch # 3) (quick, but dirty and bad), comment out the following line: - builder.add_compile_flags "-Werror" +# builder.add_compile_flags "-Werror" builder.prefix %{ #define nd_3rd u3.node @@ -1051,7 +1051,7 @@ } } - builder.prefix " extern NODE *ruby_eval_tree_begin; " \ + builder.prefix " RUBY_EXTERN NODE *ruby_eval_tree_begin; " \ if RUBY_VERSION < '1.9.0' builder.c %Q{
--- config.h.orig 2007-03-17 11:54:54 +0900 +++ config.h 2008-06-14 18:03:07 +0900 @@ -1,6 +1,8 @@ +/* #if _MSC_VER != 1200 #error MSC version unmatch #endif +*/ #define STDC_HEADERS 1 #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1
rem vcvars32.bat - vc7tk
rem Sets PATH, INCLUDE and LIB for the Visual C++ Toolkit 2003, then runs the
rem Platform SDK's SetEnv.Cmd and adds the SDK's mfc/atl/crt include directories.
rem @echo off
rem call D:\EXPF\Mingw\cmd.bat
rem Locations of the toolkit and of the Visual Studio .NET 2003 Vc7 directory.
set VC7TKInstallDir=D:\EXPF\Microsoft Visual C++ Toolkit 2003
set VC7TKMSVSDir=%ProgramFiles%\Microsoft Visual Studio .NET 2003\Vc7
rem Toolkit directories go first, then the Vc7 ones, then whatever was already set.
Set PATH=%VC7TKInstallDir%\bin;%PATH%
Set INCLUDE=%VC7TKInstallDir%\include;%VC7TKMSVSDir%\include;%INCLUDE%
Set LIB=%VC7TKInstallDir%\lib;%VC7TKMSVSDir%\lib;%LIB%
rem psdk
rem Run the Platform SDK setup script with /2000, forwarding this script's own
rem arguments (%*). NOTE(review): %MSSdk% used below is presumably defined by
rem SetEnv.Cmd - confirm.
set PSDKInstallDir=D:\EXPF\PSDK4WS2003R2
call %PSDKInstallDir%\SetEnv.Cmd /2000 %*
rem Each line prepends, so the final search order is crt, atl, mfc, then the rest.
set INCLUDE=%MSSdk%\include\mfc;%INCLUDE%
set INCLUDE=%MSSdk%\include\atl;%INCLUDE%
set INCLUDE=%MSSdk%\include\crt;%INCLUDE%
" -link /LIBPATH:\"#{Config::CONFIG['libdir']}\" /DEFAULTLIB:\"#{Config::CONFIG['LIBRUBY']}\" /INCREMENTAL:no /EXPORT:Init_#{module_name}" when /mingw32/ then
--- rbconfig.rb.orig 2008-05-15 16:17:22 +0900 +++ rbconfig.rb 2008-06-22 12:57:45 +0900 @@ -22,12 +22,12 @@ CONFIG["SHELL"] = "$(COMSPEC)" CONFIG["BUILD_FILE_SEPARATOR"] = "\\" CONFIG["PATH_SEPARATOR"] = ";" - CONFIG["CFLAGS"] = "-MD -Zi -O2b2xg- -G6" + CONFIG["CFLAGS"] = "-O2 -mtune=pentium3" CONFIG["CPPFLAGS"] = "" CONFIG["CXXFLAGS"] = "" CONFIG["FFLAGS"] = "" CONFIG["LDFLAGS"] = "" - CONFIG["LIBS"] = "oldnames.lib user32.lib advapi32.lib wsock32.lib " + CONFIG["LIBS"] = "-luser32 -ladvapi32 -lwsock32 " CONFIG["exec_prefix"] = "$(prefix)" CONFIG["bindir"] = "$(exec_prefix)/bin" CONFIG["sbindir"] = "$(exec_prefix)/sbin" @@ -56,8 +56,8 @@ CONFIG["target_cpu"] = "i386" CONFIG["target_vendor"] = "pc" CONFIG["target_os"] = "mswin32" - CONFIG["CC"] = "cl -nologo" - CONFIG["CPP"] = "cl -nologo -E" + CONFIG["CC"] = "gcc" + CONFIG["CPP"] = "gcc -E" CONFIG["YACC"] = "byacc" CONFIG["RANLIB"] = "" CONFIG["AR"] = "lib -nologo" @@ -67,14 +67,14 @@ CONFIG["CP"] = "copy > nul" CONFIG["ALLOCA"] = "" CONFIG["DEFAULT_KCODE"] = "" - CONFIG["OBJEXT"] = "obj" + CONFIG["OBJEXT"] = "o" CONFIG["XCFLAGS"] = "-DRUBY_EXPORT -I. -I./.. -I./../missing" CONFIG["XLDFLAGS"] = "-stack:0x2000000" CONFIG["DLDFLAGS"] = "-link -incremental:no -debug -opt:ref -opt:icf -dll $(LIBPATH) -def:$(DEFFILE) -implib:$(*F:.so=)-$(arch).lib -pdb:$(*F:.so=)-$(arch).pdb" CONFIG["ARCH_FLAG"] = "" CONFIG["STATIC"] = "" CONFIG["CCDLFLAGS"] = "" - CONFIG["LDSHARED"] = "cl -nologo -LD" + CONFIG["LDSHARED"] = "gcc -shared" CONFIG["DLEXT"] = "so" CONFIG["DLEXT2"] = "dll" CONFIG["LIBEXT"] = "lib"
diff -r -u scrubyt-0.3.4.orig/lib/scrubyt/core/scraping/filters/tree_filter.rb scrubyt-0.3.4/lib/scrubyt/core/scraping/filters/tree_filter.rb --- scrubyt-0.3.4.orig/lib/scrubyt/core/scraping/filters/tree_filter.rb 2008-06-21 00:52:39 +0900 +++ scrubyt-0.3.4/lib/scrubyt/core/scraping/filters/tree_filter.rb 2008-06-21 00:52:39 +0900 @@ -112,7 +112,7 @@ child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) : XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) end - break if @parent_pattern.children[0].filters.size == current_example_index + 1 + break if @parent_pattern.children[0].filters.size == current_example_index + 1 || @parent_pattern.children[0].nil? current_example_index += 1 end when EXAMPLE_TYPE_IMAGE diff -r -u scrubyt-0.3.4.orig/lib/scrubyt/core/shared/extractor.rb scrubyt-0.3.4/lib/scrubyt/core/shared/extractor.rb --- scrubyt-0.3.4.orig/lib/scrubyt/core/shared/extractor.rb 2008-06-21 00:52:39 +0900 +++ scrubyt-0.3.4/lib/scrubyt/core/shared/extractor.rb 2008-06-21 00:52:39 +0900 @@ -33,7 +33,7 @@ @processed_pages = [] backtrace = SharedUtils.get_backtrace - parts = backtrace[1].split(':') + parts = backtrace[1].scan(/(.+):(\d+)(?=$|:)/).first source_file = parts[0] Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
#!ruby -Ks
# Scrapes three saved pages (spl.htm, uni.htm, adj.htm) that sit next to this
# script with scrubyt and prints each extraction result as XML.
require 'rubygems'
require 'scrubyt'

# scrubyt.rb sets $KCODE = 'u', so set it back to Shift_JIS here.
$KCODE = "s"

# Class-level entry point shared by the scraper classes below. Each class used
# to carry its own identical copy of this method.
module CorporateActionScraping
  # Runs +iClass.parse+ on +iPath+ and prints the result's +to_xml+ output.
  #
  # iPath  - path handed to the extractor's +fetch+.
  # iClass - object responding to +parse+; defaults to the extending class,
  #          which matches the per-class defaults of the original methods.
  #
  # Returns nil (the return value of +puts+).
  def scraping( iPath, iClass = self )
    # TODO: carried over from the original, which did not say what is left to do.
    puts iClass.parse( iPath ).to_xml
  end
end

# Extractor for spl.htm: title, last-modified cell, header names and rows with
# date / brand / market / size columns.
class CorporateActionSplit
  extend CorporateActionScraping

  # Builds and runs the scrubyt extractor for the page at +iPath+.
  # Returns whatever Scrubyt::Extractor.define returns.
  def self.parse( iPath )
    Scrubyt::Extractor.define do
      # Perform the action(s)
      fetch( iPath )

      # Construct the wrapper
      title( '//div[@class="title-text"]' )
      lastmodified( '//table[@width="550"][@cellspacing="0"][@cellpadding="1"]/' +
                    'tr[1]/td[@class="mtext"][@align="right"]' )
      header( '//table[@width="550"][@cellspacing="1"][@cellpadding="1"]/' +
              'tr[1]' ) do
        # The original predicate read [@class="mtext-db">]; the stray ">" made
        # the attribute test malformed.
        name( '/td[@class="mtext-db"]' )
      end
      row( '//table[@width="550"]/tr[@*]' ) do
        date( '/td[@class="mtext"][@bgcolor="#ffffff"][1]' )
        brand( '/td[@class="mtext"][@bgcolor="#ffffff"][2]' )
        market( '/td[@class="mtext"][@bgcolor="#ffffff"][3]' )
        size( '/td[@class="mtext"][@bgcolor="#ffffff"][4]' )
      end
    end
  end
end # class CorporateActionSplit

# Extractor for adj.htm: same page skeleton as the split page, with
# date / brand / market / quart / prev / curr / ratio columns.
class CorporateActionAdjust
  extend CorporateActionScraping

  # Builds and runs the scrubyt extractor for the page at +iPath+.
  # Returns whatever Scrubyt::Extractor.define returns.
  def self.parse( iPath )
    Scrubyt::Extractor.define do
      # Perform the action(s)
      fetch( iPath )

      # Construct the wrapper
      title( '//div[@class="title-text"]' )
      lastmodified( '//table[@width="550"][@cellspacing="0"][@cellpadding="1"]/' +
                    'tr[1]/td[@class="mtext"][@align="right"]' )
      header( '//table[@width="550"][@cellspacing="1"][@cellpadding="1"]/' +
              'tr[1]' ) do
        # Stray ">" removed from the predicate (see CorporateActionSplit).
        name( '/td[@class="mtext-db"]' )
      end
      row( '//table[@width="550"]/tr[@*]' ) do
        date( '/td[@class="mtext"][@bgcolor="#ffffff"][1]' )
        brand( '/td[@class="mtext"][@bgcolor="#ffffff"][2]' )
        market( '/td[@class="mtext"][@bgcolor="#ffffff"][3]' )
        quart( '/td[@class="mtext"][@bgcolor="#ffffff"][4]' )
        prev( '/td[@class="mtext"][@bgcolor="#ffffff"][5]' )
        curr( '/td[@class="mtext"][@bgcolor="#ffffff"][6]' )
        ratio( '/td[@class="mtext"][@bgcolor="#ffffff"][7]' )
      end
    end
  end
end # class CorporateActionAdjust

# Extractor for uni.htm: same page skeleton, with
# date / brand / market / prev / curr columns.
class CorporateActionUnit
  extend CorporateActionScraping

  # Builds and runs the scrubyt extractor for the page at +iPath+.
  # Returns whatever Scrubyt::Extractor.define returns.
  def self.parse( iPath )
    Scrubyt::Extractor.define do
      # Perform the action(s)
      fetch( iPath )

      # Construct the wrapper
      title( '//div[@class="title-text"]' )
      lastmodified( '//table[@width="550"][@cellspacing="0"][@cellpadding="1"]/' +
                    'tr[1]/td[@class="mtext"][@align="right"]' )
      header( '//table[@width="550"][@cellspacing="1"][@cellpadding="1"]/' +
              'tr[1]' ) do
        # Stray ">" removed from the predicate (see CorporateActionSplit).
        name( '/td[@class="mtext-db"]' )
      end
      row( '//table[@width="550"]/tr[@*]' ) do
        date( '/td[@class="mtext"][@bgcolor="#ffffff"][1]' )
        brand( '/td[@class="mtext"][@bgcolor="#ffffff"][2]' )
        market( '/td[@class="mtext"][@bgcolor="#ffffff"][3]' )
        prev( '/td[@class="mtext"][@bgcolor="#ffffff"][4]' )
        curr( '/td[@class="mtext"][@bgcolor="#ffffff"][5]' )
      end
    end
  end
end # class CorporateActionUnit

# Driver: the input files are looked up in the directory of the running script.
base = File.dirname( $0 )
CorporateActionSplit.scraping( File.join( base, "spl.htm" ) )
CorporateActionUnit.scraping( File.join( base, "uni.htm" ) )
CorporateActionAdjust.scraping( File.join( base, "adj.htm" ) )
# Example-driven scrubyt wrapper for the split page: the row children are
# described by sample cell texts instead of XPaths.
class CorporateActionSplit
  # Builds and runs the scrubyt extractor for the page at +iPath+.
  # Returns whatever Scrubyt::Extractor.define returns.
  def self.parse( iPath )
    Scrubyt::Extractor.define do
      # Perform the action(s)
      fetch( iPath )

      # Construct the wrapper
      title( '//div[@class="title-text"]' )
      # The original expression lacked the "]" closing [@cellpadding="1",
      # which left the predicate unterminated.
      lastmodified( '//table[@width="550"][@cellspacing="0"][@cellpadding="1"]/tr[1]/td[@class="mtext"][@align="right"]' )
      row( '//table[@width="550"][@cellspacing="1"][@cellpadding="1"]/tr[@*]' ) do
        date( "分割権利落ち日" )
        brand( "銘柄" )
        market( "市場" )
        size( "割当比率" )
      end
    end
  end

  # Placeholder kept from the original: does nothing yet and returns nil.
  #
  # iPath  - path of the page to scrape (currently unused).
  # iClass - class whose +parse+ would be used (currently unused).
  def self.scraping( iPath, iClass = CorporateActionSplit )
    # todo
  end
end # class CorporateActionSplit
row( '//table[@width="550"]/tr[@*]' ) do
text( '//p', :type => :html_subtree )
text( '//p' ) do html( :type => :html_subtree ) end
script( lambda{ |string| return array }, :type => :script )
script( lambda{ |x| x.gsub!( / /, " " ) x.strip! @values = x.split( /\s+/m ) [] }, :type => :script ) brand( lambda{ [@values[0]] }, :type => :script ) ticker( lambda{ [@values[1]] }, :type => :script )
brand( @values[0], :type => :constant ) # @values[0] is empty.
script( lambda{ |x| x.gsub!( / /, " " ) x.strip! x.split( /\s+/m ) }, :type => :script ) do brand( ":0" ) ticker( ":1" ) end
Scrubyt::Extractor.define do
  fetch( path )
  # The return value does not contain the _table block itself; the contents of the block are returned.
  _table( '//table' ) do
    th( "/th" )
    # The line below raises an error.
    _td( "/td" )
  end
end
#!ruby -Ks
# Runs a small malformed HTML fragment through Tidy with Shift_JIS set for both
# input and output, prints Tidy's errors and diagnostics, then the XHTML.
require 'tidy'

# As long as the DLL is on the PATH, this is enough to load it.
Tidy.path = 'tidy.dll'

broken_html = "<title>Foo</title><table></tr></table><p>Foo! <<b>bold</b>>"

cleaned = Tidy.open( :show_warnings => true ) do |tidy|
  tidy.options.language = "jp"
  # input: shiftjis
  tidy.options.input_encoding = "shiftjis"
  # output: shiftjis, utf8, utf16, utf16be, utf16le, raw, ...
  tidy.options.output_encoding = "shiftjis"
  tidy.options.output_xhtml = true

  converted = tidy.clean( broken_html )
  puts( tidy.errors )
  puts( tidy.diagnostics )
  # The block's value becomes the value of Tidy.open.
  converted
end

puts cleaned
module Scrubyt
  class PreFilterDocument
    # Replaces every <br> tag (any case, optional spaces/slash) with "\r\n" and
    # every non-breaking space (U+00A0) with a plain space.
    #
    # doc - String holding the document markup; it is not modified.
    #
    # Returns a new String.
    #
    # Fixes over the previous version:
    # * the gsub result was discarded, so <br> tags were never converted;
    # * tr("\240", " ") worked on the raw byte 0xA0, which can corrupt
    #   multibyte text and fails on encoding-aware Rubies.
    def self.br_to_newline(doc)
      # doc = tidy_normalize(doc)
      text = doc.gsub(/<br[ \/]*>/i, "\r\n")
      nbsp = nbsp_for(text)
      nbsp ? text.gsub(nbsp, ' ') : text
    end

    # Returns U+00A0 expressed in the encoding of +text+, or nil when that
    # encoding cannot represent it (the text then cannot contain one either).
    def self.nbsp_for(text)
      nbsp = [0xA0].pack('U')
      # Strings without #encoding (Ruby 1.8) are plain bytes; use the UTF-8 form.
      return nbsp unless text.respond_to?(:encoding)
      nbsp.encode(text.encoding)
    rescue EncodingError
      nil
    end
    private_class_method :nbsp_for
  end
end
# Reopens OpenSSL's certificate store so that every call to set_default_paths
# also adds the "site.cer" file located next to the running script.
class OpenSSL::X509::Store
  # Keep the stock implementation reachable under a second name.
  alias_method :set_default_paths__hogehoge, :set_default_paths

  # Runs the stock set_default_paths, then adds <script dir>/site.cer.
  # Returns the result of add_file.
  def set_default_paths
    set_default_paths__hogehoge
    script_dir = File.dirname( $0 )
    add_file( File.join( script_dir, "site.cer" ) )
  end
end
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=Shift_JIS">
<!-- 京都 -->
<!-- NOTE(review): the Japanese text above is presumably an encoding-detection hint for Shift_JIS; left untouched - confirm. -->
</head>
<script language="JavaScript">
// Shows the cookie of the document reached through external.menuArguments
// and copies it to the clipboard.
// NOTE(review): external.menuArguments is presumably supplied by the hosting
// browser's context-menu extension mechanism - confirm.
var obj = external.menuArguments;
var doc = obj.document;
var ck = doc.cookie;
alert( "cookie is \"" + ck + "\". copy to clipboard." );
// The clipboard object is taken from the window that owns the document.
var cb = doc.parentWindow.clipboardData;
cb.setData( "text", ck );
</script>
</html>
strTitle1 = "ちょこっと 強制削除" strFile1 = "c_del.html"
NoMethodError: undefined method `each' for nil:NilClass D:/ASR/lib/ruby/gems/1.8/gems/scrubyt-0.4.06/lib/scrubyt/utils/shared_utils.rb:42:in `traverse_for_match'
From: michael <...@gmail.com> Date: Sun, 26 Apr 2009 16:20:25 -0700 (PDT) Subject: Re: Can't get scrubyt to work You may be affected by a bug in shared_utils.rb try updating that file using: http://github.com/scrubber/scrubyt/commit/c97595d2...
@@ -39,7 +39,7 @@ module Scrubyt results << node results.delete node.parent if node.is_a? Hpricot::Elem end - node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } + node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } if ! node.children.nil? } traverse_for_match_inner.call(node,regexp) results