Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

require "fileutils"

# A subclass of CbrainTask::ClusterTask to run SimpleFileExtractor.
class CbrainTask::SimpleFileExtractor < ClusterTask

Expand Down Expand Up @@ -94,9 +96,15 @@ def save_results #:nodoc:
ids = params[:interface_userfile_ids]

# Main inputs
patterns = patterns_as_array(params[:patterns].presence || {})

file_cols = FileCollection.where(:id => ids).to_a

patterns, repls, folds = patterns_as_arrays(
params[:patterns].presence || {},
params[:replace_paths].presence || {},
params[:folders].presence || {}
)

# Error and warning helpers
error_examples = {}
error_counts = {}
Expand Down Expand Up @@ -127,21 +135,30 @@ def save_results #:nodoc:
cache_path = userfile.cache_full_path
parent_cpath = cache_path.parent
patterns.each_with_index do |pat,patidx|
pat = pat.dup
pat = Pathname.new(pat).cleanpath

pat = pat.dup
pat = Pathname.new(pat).cleanpath
rep = repls[patidx]&.dup
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can be

rep = repls[patidx].dup

without the &, as is done for folds[patidx]? It will return nil anyway if repls does not have patidx, no?

rep = Pathname.new(rep) if rep
fold = folds[patidx].dup

# Replace "*/" at the beginning of a pattern with "userfilename/"
# This is just an optimization for flat dir DPs, removing one
# unnecessary level of globbing
if rep.present?
pat_orig = pat.to_s.dup
regexp = glob_to_regex(pat_orig.to_s) # to use gsub - potentially maybe allow use globe pattern or regex on will
end
if pat.to_s.starts_with?("*/")
pat = pat.to_s
pat[0] = userfile.name # replaces the *
pat = Pathname.new(pat)
end

# Quick safety check just like in after_form on portal side
cb_error "Wrong pattern encountered: #{pat}" if
(! pat.relative?) || (! pat.to_s.index('/')) || (pat.to_s.start_with? "../")
cb_error "Wrong pattern encountered: #{pat}" if (! pat.relative?) || (! pat.to_s.index('/')) || (pat.to_s.start_with? "../")
cb_error "Wrong replacement pattern: #{rep}" if rep && ( (! rep.relative?) || (rep.to_s.start_with? "../") )

path_pattern = parent_cpath + pat
globbed_paths=Dir.glob(path_pattern.to_s)
if globbed_paths.empty?
Expand All @@ -154,27 +171,45 @@ def save_results #:nodoc:
log_it.("Globbing through missing filesystem entries", pat, userfile, filepath)
next
end
if rep.present?
relpath = (Pathname.new filepath).relative_path_from(File.realpath(Pathname.new parent_cpath)) # new path
target = "extracted/" + relpath.to_s.gsub(regexp, rep.to_s)
else
basename = File.basename(filepath)
target = "extracted/#{basename}"
end
if ! filepath.start_with?(cache_path.to_s)
log_it.("Extraction outside collection", pat, userfile, filepath)
next
end
if ! (Pathname.new target).cleanpath.to_s.start_with?("extracted/")
log_it.("Probably bad renaming pattern", pat_orig.to_s + ' --- ' + rep.to_s, userfile, filepath)
next
end
if File.symlink?(filepath)
log_it.("Trying to extract a symbolic link", pat, userfile, filepath)
log_it.("Trying to extract a symbolic link", pat_orig, userfile, filepath)
next
end
if ! File.file?(filepath)
log_it.("Trying to extract a non regular file", pat, userfile, filepath)
if fold == "0" && ! File.file?(filepath)
log_it.("Trying to extract a non regular file", pat_orig, userfile, filepath)
next
end
basename = File.basename(filepath)
if File.file?("extracted/#{basename}")

if File.exist?(target)
log_it.("Trying to extract a file with a name matching something already extracted", pat, userfile, filepath)
next
end

unless rep.to_s.blank?
# make sure the target directory exists
dir = File.dirname(target)
FileUtils.mkdir_p(dir)
end

# Make the copy
system "cp", "#{filepath}", "extracted/#{basename}" # no .bash_escape because no bash subshell
system "cp", "-rn", "#{filepath}", target # no .bash_escape because no bash subshell
status = $? # a Process::Status object
basename = File.basename(filepath)
if status.signaled?
self.addlog("Error copying file '#{basename}': got signal #{status.termsig || 'unknown'}. This is fatal.")
return false
Expand Down Expand Up @@ -231,4 +266,3 @@ def save_results #:nodoc:
# friends, described in the CbrainTask Programmer Guide.

end

Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,86 @@
# Model code common to the Bourreau and Portal side for SimpleFileExtractor.
class CbrainTask::SimpleFileExtractor

# In the params, the list of patterns is maintained as a hash:
# { "0" => "pat1", "1" => "pat2", etc }
# This returns just the array of values, while preserving the ordering
# that the keys encode:
# [ "pat1", "pat2" etc ]
def patterns_as_array(pat_hash)
keys = pat_hash.keys.sort { |a,b| a.to_i <=> b.to_i }
pat_array = keys.map { |i| pat_hash[i].presence }.compact
pat_array
# In the params, the patterns, replacement paths and folder flags are each maintained as a hash:
#   { "0" => "*/pat1", "1" => "*/pat2", etc }, { "0" => "subfolder1", "1" => nil, etc }, { "0" => "1", "1" => "0", etc }
# This method converts the three hashes into three arrays of values, while preserving the ordering
# that the keys encode, and skipping empty rows:
#   [ ["*/pat1", "*/pat2"], ["subfolder1", nil], ["1", "0"] ]
# Usually some indexes carry no info in any of the three categories; those rows are dropped.
def patterns_as_arrays(pat_hash, repl_hash, fold_hash)
  # Builds one [pattern, replacement, folder_flag] row per key of pat_hash,
  # visiting the keys in numeric order ("2" comes before "10"), and returns
  # three parallel arrays: [ patterns, replacements, folder_flags ].
  keys = pat_hash.keys.sort_by(&:to_i)
  rows = keys.map do |key|
    [
      pat_hash[key]&.strip.presence,  # blank or whitespace-only text becomes nil
      repl_hash[key]&.strip.presence,
      fold_hash[key]
    ]
  end

  # Filter out blank rows: no pattern, no replacement, and folder flag not "1"
  rows = rows.select { |pat, repl, fold| pat || repl || fold == "1" }

  # Array#transpose on an empty array returns [], which would leave callers
  # that destructure the result into three variables holding nil; always
  # hand back three arrays instead.
  return [ [], [], [] ] if rows.empty?

  rows.transpose
end

# This does the opposite of patterns_as_array; given
# an array of patterns, returns a hash where the keys are
# This performs the opposite of patterns_as_arrays for a single array; given
# an array of patterns, paths, or flags, returns a hash where the keys are
# the indexes of the array
def patterns_as_hash(pat_array)
pat_hash = {}
pat_array.each_with_index { |pat,i| pat_hash[i.to_s] = pat }
pat_hash
# The hash it returns has the array indexes (stringified) as keys
#
def array_to_hash(arr)
  # Each element is stored under its own position, with the position
  # converted to a string: ["a", "b"] becomes { "0" => "a", "1" => "b" }
  arr.each_with_index.each_with_object({}) do |(value, position), indexed|
    indexed[position.to_s] = value
  end
end

end
# best effort mapping of a glob pattern to regex (with groups)
# https://stackoverflow.com/questions/1307712/how-to-convert-glob-to-regular-expression
def glob_to_regex(glob)
  # Translates a file glob into a Regexp anchored on the whole string.
  # Each wildcard (*, **, ?) and each {a,b} alternation becomes one capture
  # group, in left-to-right order; [set] character classes do not capture.
  # The translation follows the matching rules of Dir.glob so that any path
  # found by globbing a pattern is also matched by the regex of that pattern.
  escaped = +''  # unfrozen buffer, safe even under frozen_string_literal
  i = 0
  while i < glob.length
    char = glob[i]

    case char
    when '*'
      if glob[i, 3] == '**/' && (i.zero? || glob[i - 1] == '/')
        # "**/" as a full path component matches zero or more directories,
        # so the directories and their trailing slash are optional as a unit.
        # The outer group is non-capturing: still exactly one capture group.
        escaped << '(?:(.+?)/)?'
        i += 2
      elsif glob[i, 2] == '**'
        escaped << '(.+?)' # non-greedy match across directories
        i += 1
      else
        escaped << '([^/]*)' # * matches zero or more characters within one path segment
      end
    when '?'
      escaped << '([^/])' # ? matches exactly one character, never a path separator
    when '['
      # Copy character class literally until closing ]
      j = glob.index(']', i + 1)
      if j
        escaped << glob[i..j] # include the closing ]
        i = j
      else
        # No closing bracket: treat [ as an ordinary character instead of
        # building a regex that fails to compile
        escaped << Regexp.escape(char)
      end
    when '{'
      # Convert {a,b,c} → (a|b|c)
      j = i + 1
      brace_content = +''
      depth = 1
      while j < glob.length && depth > 0
        if glob[j] == '{'
          depth += 1
        elsif glob[j] == '}'
          depth -= 1
        end
        brace_content << glob[j] if depth > 0
        j += 1
      end
      # The -1 limit keeps a trailing empty alternative, as in "{.bak,}"
      alternatives = brace_content.split(',', -1).map { |x| Regexp.escape(x) }.join('|')
      escaped << "(#{alternatives})"
      i = j - 1
    else
      escaped << Regexp.escape(char) # escape other character
    end
    i += 1
  end
  Regexp.new("\\A#{escaped}\\z")
end

end
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ class CbrainTask::SimpleFileExtractor < PortalTask
# is created with #:nodoc: in this template.
def self.default_launch_args #:nodoc:
  # Initial params for a new task. The :patterns key is listed once only;
  # a repeated key in a hash literal is overwritten and triggers a Ruby warning.
  {
    # keys are numeric, values are the patterns
    :patterns      => {},
    :replace_paths => {},
    # values are flags; any key not yet set reads as "0"
    :folders       => Hash.new("0".freeze)
  }
end

Expand All @@ -53,10 +57,19 @@ def after_form #:nodoc:
FileCollection.is_legal_filename?(out_name)

# Clean up pattern list
patterns = patterns_as_array(params[:patterns].presence || {})
patterns = patterns.map(&:presence).compact.map(&:strip).map(&:presence).compact # ignore blanks at each end
patterns = patterns.map { |pat| Pathname.new(pat).cleanpath }
params[:patterns] = patterns_as_hash(patterns.map(&:to_s)) # write back cleaned list

patterns, repls, folds = patterns_as_arrays(
params[:patterns].presence || {},
params[:replace_paths].presence || {},
params[:folders].presence || {}
)

patterns = patterns.map { |pat| Pathname.new(pat).cleanpath.to_s if pat}
repls = repls.map { |pat| Pathname.new(pat).cleanpath.to_s if pat }

params[:patterns] = array_to_hash(patterns)
params[:replace_paths] = array_to_hash(repls)
params[:folders] = array_to_hash(folds)

# Validate them and report errors; note that here the arrays contain cleaned-up path strings (or nil), not Pathname objects
#
Expand All @@ -67,7 +80,26 @@ def after_form #:nodoc:
# */subdir/*/*.txt
# FileColName*/*/*.txt
patterns.each_with_index do |pat,idx|
if ! pat.relative?

rep = repls[idx]
fld = folds[idx]

if rep.present? && pat.blank?
self.params_errors.add("replace_paths[#{idx}]", "replacement path cannot be provided without pattern")
end
if fld.to_s == "1" && pat.blank?
self.params_errors.add("folders[#{idx}]", "folder extraction flag cannot be set without pattern")
end
if rep.present? && ! Pathname.new(rep).relative?
self.params_errors.add("replace_path[#{idx}]", "is not a relative path")
end
if rep.to_s.start_with? "../"
self.params_errors.add("replace_path[#{idx}]", "cannot map outside of collections")
end

next if pat.blank? # shortcut for pattern validation if it is not present

if pat && ! Pathname.new(pat).relative?
self.params_errors.add("patterns[#{idx}]", "is not a relative path")
end
if ! pat.to_s.index('/') # must contain at least 2 components
Expand Down Expand Up @@ -116,4 +148,3 @@ def validate_input_ids(ids)
end

end

Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,15 @@
</fieldset>

<fieldset>
<legend>Extraction patterns:</legend>
<legend>Patterns to match and place files </legend>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure about this legend.

<% 10.times do |i| %>
<%= form.params_text_field "patterns[#{i}]", :size => 120 %><p>
<%= form.label "patterns[#{i}]", "Match pattern # #{i}"%>
<%= form.params_text_field "patterns[#{i}]", :size => 100 %> <p>
<%= form.label "replace_paths[#{i}]", "Substitute" %>
<%= form.params_text_field "replace_paths[#{i}]", :size => 100 %> <p></p>
<%= form.label "folders[#{i}]", "Extract folders?" %>
<%= form.params_check_box "folders[#{i}]" %> <p>
<hr>
<% end %>
<p>
<strong>About these patterns:</strong>
Expand All @@ -62,6 +68,10 @@
Typically, all patterns will start with <em>*/</em> because the first component must match
the names of the FileCollections themselves.
<p>
If you like, you can provide a replacement pattern that can include \1, \2, etc. to reference the 1st, 2nd, etc. matched
wildcards (* or ?).
\+ stands for the last one and \0 stands for the entire path.

See the 'help' link above for more information and examples.
</div>
</fieldset>
Expand All @@ -70,4 +80,3 @@
<legend>Output name</legend>
<%= form.params_text_field :output_file_name, :size => 40 %>
</fieldset>