Recent changes to this wiki:
in test suite display error from git push that fails to exit nonzero
diff --git a/Test.hs b/Test.hs index 9796608365..7c08d3e948 100644 --- a/Test.hs +++ b/Test.hs @@ -446,10 +446,12 @@ test_git_remote_annex exporttree git_annex "get" [] "get failed" () <- populate git "config" ["remote.foo.url", "annex::"] "git config" - git "push" ["foo", "master"] "git push" - git "push" ["foo", "git-annex"] "git push" + -- git push does not always propagate nonzero exit + -- status from git-remote-annex, so remember the + -- transcript and display it if clone fails + pushtranscript <- testProcess' "git" ["push", "foo", "master", "git-annex"] Nothing (== True) (const True) "git push" git "clone" ["annex::"++diruuid++"?"++intercalate "&" cfg', "clonedir"] - "git clone from special remote" + ("git clone from special remote (after git push with output: " ++ pushtranscript ++ ")") inpath "clonedir" $ git_annex "get" [annexedfile] "get from origin special remote" diruuid="89ddefa4-a04c-11ef-87b5-e880882a4f98" diff --git a/Test/Framework.hs b/Test/Framework.hs index c249e93529..ab6645308f 100644 --- a/Test/Framework.hs +++ b/Test/Framework.hs @@ -73,7 +73,11 @@ import qualified Command.Uninit -- In debug mode, the output is allowed to pass through. -- So the output does not get checked in debug mode. testProcess :: String -> [String] -> Maybe [(String, String)] -> (Bool -> Bool) -> (String -> Bool) -> String -> Assertion -testProcess command params environ expectedret expectedtranscript faildesc = do +testProcess command params environ expectedret expectedtranscript faildesc = + void $ testProcess' command params environ expectedret expectedtranscript faildesc + +testProcess' :: String -> [String] -> Maybe [(String, String)] -> (Bool -> Bool) -> (String -> Bool) -> String -> IO String +testProcess' command params environ expectedret expectedtranscript faildesc = do let p = (proc command params) { env = environ } debug <- testDebug . 
testOptions <$> getTestMode if debug @@ -81,10 +85,12 @@ testProcess command params environ expectedret expectedtranscript faildesc = do ret <- withCreateProcess p $ \_ _ _ pid -> waitForProcess pid (expectedret (ret == ExitSuccess)) @? (faildesc ++ " failed with unexpected exit code") + return "" else do (transcript, ret) <- Utility.Process.Transcript.processTranscript' p Nothing (expectedret ret) @? (faildesc ++ " failed with unexpected exit code (transcript follows)\n" ++ transcript) (expectedtranscript transcript) @? (faildesc ++ " failed with unexpected output (transcript follows)\n" ++ transcript) + return transcript -- Run git. (Do not use to run git-annex as the one being tested -- may not be in path.) diff --git a/doc/bugs/tests_started_to_fail_recently/comment_5_edf45de127e639174893775d41e5a6c5._comment b/doc/bugs/tests_started_to_fail_recently/comment_5_edf45de127e639174893775d41e5a6c5._comment new file mode 100644 index 0000000000..b7eb3e6fda --- /dev/null +++ b/doc/bugs/tests_started_to_fail_recently/comment_5_edf45de127e639174893775d41e5a6c5._comment @@ -0,0 +1,32 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 5""" + date="2024-11-20T17:42:57Z" + content=""" +My arm64-ancient build failed today in the same way as the OSX build is failing, +so I should be able to debug it there. + + builder@sparrow:~/x/a$ git push d git-annex + Full remote url: annex::f88d4965-fc4f-4dd0-aac2-eaf19c9bcfa5?encryption=none&type=directory + fatal: Refusing to create empty bundle. + Push failed (user error (git ["--git-dir=.git","--work-tree=.","--literal-pathspecs","bundle","create","--quiet","/tmp/GITBUNDLE1049637-0","--stdin"] exited 128)) + warning: helper reported unexpected status of push + Everything up-to-date + builder@sparrow:~/x/a$ echo $? + 0 + +Huh ok, so git-remote-annex is failing to push, which is why clone +later fails. And for whatever reason git doesn't propigate the error, which +is why this is not visible in the transcript. 
+ +That build uses git 2.30.2. That git bundle --stdin was broken and +didn't read refs from stdin at all. Also it had other bugs. I think it's +best not to try to support git-remote-annex with that version of git at +all, given those bugs. + +That probably won't help with the OSX failure, which is with a very new git +version. So I also made the test +suite capture the git push output even when it exits successfully, so it +can display it when the git pull fails. That should show what the problem +is there. +"""]]
comment
diff --git a/doc/todo/p2phttp_serve_multiple_repositories/comment_1_28942d454244ea6df6aabed03b43d8a3._comment b/doc/todo/p2phttp_serve_multiple_repositories/comment_1_28942d454244ea6df6aabed03b43d8a3._comment new file mode 100644 index 0000000000..d67d34a1dc --- /dev/null +++ b/doc/todo/p2phttp_serve_multiple_repositories/comment_1_28942d454244ea6df6aabed03b43d8a3._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2024-11-20T17:41:12Z" + content=""" +I have some early work toward implementing this in the `p2phttp-multi` +branch. +"""]]
Added a comment
diff --git a/doc/bugs/tests_started_to_fail_recently/comment_4_989bcca4ecd2a00c509585034d707547._comment b/doc/bugs/tests_started_to_fail_recently/comment_4_989bcca4ecd2a00c509585034d707547._comment new file mode 100644 index 0000000000..459dbf6bb8 --- /dev/null +++ b/doc/bugs/tests_started_to_fail_recently/comment_4_989bcca4ecd2a00c509585034d707547._comment @@ -0,0 +1,9 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="comment 4" + date="2024-11-20T00:22:44Z" + content=""" +- re mac: try `joey@datalads-imac2` from `smaug` +- a few times we used https://github.com/mxschmitt/action-tmate to interactively debug on github CI... want us to bolt it on? +"""]]
reuse http url password for p2phttp url when on same host
When remote.name.annexUrl is an annex+http(s) url that uses the same
hostname as remote.name.url, which is itself an http(s) url, they are
assumed to share a username and password.
This avoids unnecessary duplicate password prompts.
diff --git a/CHANGELOG b/CHANGELOG index 7e523186c6..249ed77549 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -17,6 +17,10 @@ git-annex (10.20241032) UNRELEASED; urgency=medium versioned S3 bucket. * git-remote-annex: Fix cloning from a special remote on a crippled filesystem. + * When remote.name.annexUrl is an annex+http(s) url, that + uses the same hostname as remote.name.url, which is itself a http(s) + url, they are assumed to share a username and password. This avoids + unnecessary duplicate password prompts. -- Joey Hess <id@joeyh.name> Mon, 11 Nov 2024 12:26:00 -0400 diff --git a/P2P/Http/Client.hs b/P2P/Http/Client.hs index d047eca7a0..c708908a19 100644 --- a/P2P/Http/Client.hs +++ b/P2P/Http/Client.hs @@ -37,6 +37,7 @@ import Annex.Concurrent import Utility.Url (BasicAuth(..)) import Utility.HumanTime import Utility.STM +import qualified Git import qualified Git.Credential as Git import Servant hiding (BasicAuthData(..)) @@ -83,8 +84,19 @@ p2pHttpClientVersions -> (String -> Annex a) -> ClientAction a -> Annex (Maybe a) +p2pHttpClientVersions allowedversion rmt fallback clientaction = do + rmtrepo <- getRepo rmt + p2pHttpClientVersions' allowedversion rmt rmtrepo fallback clientaction + +p2pHttpClientVersions' + :: (ProtocolVersion -> Bool) + -> Remote + -> Git.Repo + -> (String -> Annex a) + -> ClientAction a + -> Annex (Maybe a) #ifdef WITH_SERVANT -p2pHttpClientVersions allowedversion rmt fallback clientaction = +p2pHttpClientVersions' allowedversion rmt rmtrepo fallback clientaction = case p2pHttpBaseUrl <$> remoteAnnexP2PHttpUrl (gitconfig rmt) of Nothing -> error "internal" Just baseurl -> do @@ -139,9 +151,13 @@ p2pHttpClientVersions allowedversion rmt fallback clientaction = ++ " " ++ decodeBS (statusMessage (responseStatusCode resp)) - credentialbaseurl = case p2pHttpUrlString <$> remoteAnnexP2PHttpUrl (gitconfig rmt) of + credentialbaseurl = case remoteAnnexP2PHttpUrl (gitconfig rmt) of + Just p2phttpurl + | isP2PHttpSameHost p2phttpurl rmtrepo -> + 
Git.repoLocation rmtrepo + | otherwise -> + p2pHttpUrlString p2phttpurl Nothing -> error "internal" - Just url -> url credauth cred = do ba <- Git.credentialBasicAuth cred @@ -159,7 +175,7 @@ p2pHttpClientVersions allowedversion rmt fallback clientaction = M.insert (Git.CredentialBaseURL credentialbaseurl) cred cc Nothing -> noop #else -p2pHttpClientVersions _ _ fallback () = Just <$> fallback +p2pHttpClientVersions _ _ _ fallback () = Just <$> fallback "This remote uses an annex+http url, but this version of git-annex is not built with support for that." #endif diff --git a/P2P/Http/Url.hs b/P2P/Http/Url.hs index 9e1af2c8dc..b7ec6d22fe 100644 --- a/P2P/Http/Url.hs +++ b/P2P/Http/Url.hs @@ -15,6 +15,9 @@ import Network.URI import System.FilePath.Posix as P import Servant.Client (BaseUrl(..), Scheme(..)) import Text.Read +import Data.Char +import qualified Git +import qualified Git.Url #endif defaultP2PHttpProtocolPort :: Int @@ -79,3 +82,15 @@ unavailableP2PHttpUrl p = p #ifdef WITH_SERVANT { p2pHttpBaseUrl = (p2pHttpBaseUrl p) { baseUrlHost = "!dne!" } } #endif + +#ifdef WITH_SERVANT +-- When a p2phttp url is on the same host as a git repo, which also uses +-- http, the same username+password is assumed to be used for both. +isP2PHttpSameHost :: P2PHttpUrl -> Git.Repo -> Bool +isP2PHttpSameHost u repo + | not (Git.repoIsHttp repo) = False + | otherwise = + Just (map toLower $ baseUrlHost (p2pHttpBaseUrl u)) + == + (map toLower <$> (Git.Url.host repo)) +#endif diff --git a/doc/git-annex-p2phttp.mdwn b/doc/git-annex-p2phttp.mdwn index 802c52d929..3d10f62198 100644 --- a/doc/git-annex-p2phttp.mdwn +++ b/doc/git-annex-p2phttp.mdwn @@ -20,6 +20,12 @@ as usual, and `remote.name.annexUrl` set to an annex+http url such as "annex+http://example.com/git-annex/". The annex+http url is served by this server, and uses port 9417 by default. 
+Note that, when `remote.name.url` and `remote.name.annexUrl` +contain the same hostname, they are assumed by git-annex to +support the same users and passwords. So, git-annex will use +the password for the `remote.name.url` to log into the +`remote.name.annexUrl`. + As well as serving the git-annex HTTP API, this server provides a convenient way to download the content of any key, by using the path "/git-annex/$uuid/$key". For example: diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 178ad146d3..a082c97647 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1569,6 +1569,13 @@ Remotes are configured using these settings in `.git/config`. git operations. This allows using [[git-annex-p2phttp]] to serve a git-annex repository over http. + When this and the `remote.<name>.url` contain the same hostname, + and this is an annex+http(s) url, and that is also a http(s) url, + git-annex assumes that the same username and password can be used + for both urls. When password cacheing is configured, this allows + you to only be prompted once for a password when using both git and + git-annex. See gitcredentials(7) for how to set up password caching. + * `remote.<name>.annex-uuid` git-annex caches UUIDs of remote repositories here. diff --git a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host.mdwn b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host.mdwn index da46f5b7f1..cbef6aebca 100644 --- a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host.mdwn +++ b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host.mdwn @@ -13,3 +13,6 @@ I see some ways to address this: 3. Perhaps most elegantly: make p2phttp support serving multiple repositories, so that repositories could share the same annexurl and therefore share credentials [[!tag projects/INM7]] + +> I have implemented reuse of the remote.name.url password for +> remote.name.annexurl when they are on the same host. [[done]] --[[Joey]]
update
diff --git a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_3_7ea1596e9c9c06ef609a8aa6bccefd29._comment b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_2_7ea1596e9c9c06ef609a8aa6bccefd29._comment similarity index 97% rename from doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_3_7ea1596e9c9c06ef609a8aa6bccefd29._comment rename to doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_2_7ea1596e9c9c06ef609a8aa6bccefd29._comment index be11c10dfd..69c350d2a3 100644 --- a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_3_7ea1596e9c9c06ef609a8aa6bccefd29._comment +++ b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_2_7ea1596e9c9c06ef609a8aa6bccefd29._comment @@ -1,6 +1,6 @@ [[!comment format=mdwn username="joey" - subject="""comment 3""" + subject="""comment 2""" date="2024-11-19T17:37:01Z" content=""" credential.useHttpPath is the relevant git config for this git-credential diff --git a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_2_63806afed3ab03308584415506183ced._comment b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_3_63806afed3ab03308584415506183ced._comment similarity index 54% rename from doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_2_63806afed3ab03308584415506183ced._comment rename to doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_3_63806afed3ab03308584415506183ced._comment index adee58f09b..fa24e48a0a 100644 --- a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_2_63806afed3ab03308584415506183ced._comment +++ b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_3_63806afed3ab03308584415506183ced._comment @@ -1,6 +1,6 @@ [[!comment format=mdwn username="joey" - subject="""comment 2""" + subject="""comment 3""" date="2024-11-19T17:19:38Z" content=""" Unfortunately, remote.foo.annexUrl is 
not limited to use for p2phttp. It @@ -21,4 +21,20 @@ prompt. So, I think it makes sense to only do this when credential.helper is configured. And when the hostname is the same in both the git url and the p2phttp url. + +Hmm, I can imagine a situation where this behavior could be considered a +security hole. Suppose A and B both have accounts on the same host. A is in +charge of serving the git repositories. B is in charge of serving git-annex +p2phttp. This would make git-annex prompt for a password to +one of user A's git repositories, and send the password to user B. So B +would be able to crack into the git repos. + +That is pretty farfetched. But it begs the question: If the git +repository and p2phttp are on the same host, why would they *ever* need 2 +distinct passwords? If git-annex simply doesn't support that A/B split, +then that security hole can't happen. + +So, git-annex could simply, when the git url and p2phttp url have the same +hostname, request the git credentials for the git url, rather than for the +p2phttp url. """]]
comments
diff --git a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_2_63806afed3ab03308584415506183ced._comment b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_2_63806afed3ab03308584415506183ced._comment new file mode 100644 index 0000000000..adee58f09b --- /dev/null +++ b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_2_63806afed3ab03308584415506183ced._comment @@ -0,0 +1,24 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2024-11-19T17:19:38Z" + content=""" +Unfortunately, remote.foo.annexUrl is not limited to use for p2phttp. It +existed before that and could be legitimately set to a http url when +p2phttp is not being used. + +I agree it would be good to try to reuse the credentials of the git url for +p2phttp. That could be done by just querying git credential for the git url +credentials, and trying to use them for the p2phttp url. If they don't work, +use git credential to prompt for the p2phttp url credentials as it does now. + +If the user had credential.helper configured, they would probably already +have the git credentials cached, and if not, this would cache them for +later use, so no harm done asking for them. But if credential.helper was +not configured, there would be an extra and wholly unncessary password +prompt. + +So, I think it makes sense to only do this when credential.helper is +configured. And when the hostname is the same in both the git url +and the p2phttp url. 
+"""]] diff --git a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_3_7ea1596e9c9c06ef609a8aa6bccefd29._comment b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_3_7ea1596e9c9c06ef609a8aa6bccefd29._comment new file mode 100644 index 0000000000..be11c10dfd --- /dev/null +++ b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_3_7ea1596e9c9c06ef609a8aa6bccefd29._comment @@ -0,0 +1,24 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2024-11-19T17:37:01Z" + content=""" +credential.useHttpPath is the relevant git config for this git-credential +behavior. + +I think it would be reasonable for git-annex to check if that is false, and +if so, remove the path from the `git credential` request for an annex+http +url. + +But I agree, it would be better, in the vast majority of cases, to have a +single url endpoint that serves multiple repositories. + +And for that matter, if someone is running git-annex p2phttp to serve 2 +different repositories right now, they are probably making the two listen +on different ports and so removing the path wouldn't help. They would have +to be interposing another web server that mapped those ports to paths, like +you have done with forgejo-aneksajo, for the path mangling to help. + +So implementing [[todo/p2phttp_serve_multiple_repositories]] +seems better than adding such path mangling. +"""]]
tag INM7
diff --git a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host.mdwn b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host.mdwn index 075c4b57a3..da46f5b7f1 100644 --- a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host.mdwn +++ b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host.mdwn @@ -11,3 +11,5 @@ I see some ways to address this: 1. Remove the path from the request to `git credential` on git-annex' side 2. Allow `remote.<name>.annexurl` to be set to `http(s)://` URLs in addition to `annex+http(s)://`, exploiting the difference in the `git credential` behavior 3. Perhaps most elegantly: make p2phttp support serving multiple repositories, so that repositories could share the same annexurl and therefore share credentials + +[[!tag projects/INM7]]
comment
diff --git a/doc/bugs/tests_started_to_fail_recently/comment_3_2acec0272bc0f9ad0e706797851c5345._comment b/doc/bugs/tests_started_to_fail_recently/comment_3_2acec0272bc0f9ad0e706797851c5345._comment new file mode 100644 index 0000000000..cf9afe3ee0 --- /dev/null +++ b/doc/bugs/tests_started_to_fail_recently/comment_3_2acec0272bc0f9ad0e706797851c5345._comment @@ -0,0 +1,13 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2024-11-19T17:04:13Z" + content=""" +Aha, this test on ubuntu is failing the same way as the OSX test: + +<https://github.com/datalad/git-annex/actions/runs/11905453897/job/33176247387> + +It seems that "custom-config1" only involves a annex.stalldetection +setting, if I am reading the workflow file right. I was not able to +reproduce the failure with that config set though. +"""]]
split git-remote-annex test
diff --git a/Test.hs b/Test.hs index 605bd85fc1..a850cae258 100644 --- a/Test.hs +++ b/Test.hs @@ -282,7 +282,8 @@ repoTests note numparts = map mk $ sep [ testCase "add dup" test_add_dup , testCase "add extras" test_add_extras , testCase "add moved link" test_add_moved - , testCase "git-remote-annex" test_git_remote_annex + , testCase "git-remote-annex" (test_git_remote_annex False) + , testCase "git-remote-annex exporttree" (test_git_remote_annex True) , testCase "readonly remote" test_readonly_remote , testCase "ignore deleted files" test_ignore_deleted_files , testCase "metadata" test_metadata @@ -422,12 +423,14 @@ test_add_extras = intmpclonerepo $ do annexed_present wormannexedfile checkbackend wormannexedfile backendWORM -test_git_remote_annex :: Assertion -test_git_remote_annex = do - testspecialremote [] $ - git_annex "copy" ["--to=foo"] "copy" - testspecialremote ["importtree=yes", "exporttree=yes"] $ - git_annex "export" ["master", "--to=foo"] "export" +test_git_remote_annex :: Bool -> Assertion +test_git_remote_annex exporttree + | exporttree = + testspecialremote ["importtree=yes", "exporttree=yes"] $ + git_annex "export" ["master", "--to=foo"] "export" + | otherwise = + testspecialremote [] $ + git_annex "copy" ["--to=foo"] "copy" where testspecialremote cfg populate = intmpclonerepo $ do let cfg' = ["type=directory", "encryption=none", "directory=dir"] ++ cfg diff --git a/doc/bugs/tests_started_to_fail_recently/comment_2_a58e3ebcd37f866d7154f66da8c01929._comment b/doc/bugs/tests_started_to_fail_recently/comment_2_a58e3ebcd37f866d7154f66da8c01929._comment new file mode 100644 index 0000000000..ecc2322267 --- /dev/null +++ b/doc/bugs/tests_started_to_fail_recently/comment_2_a58e3ebcd37f866d7154f66da8c01929._comment @@ -0,0 +1,18 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2024-11-19T16:48:25Z" + content=""" +Re the OSX failure, it seems that somehow the manifest key is not being +found when the test is run on OSX. 
I don't know why. There is nothing in +this code that should be OSX-specific. + +Unfortunately I do not have access to any OSX system to try to investigate +this. The "datalads-mac" I used to use does not seem to exist anymore. + +Of course, this test could be skipped on OSX. + +Does occur to me this could somehow be exposing a deeper problem on OSX +with exporttree special remotes. I have split the failing test in two, so +we'll see if both fail, or only the exporttree one. +"""]]
retitle OSX bug
diff --git a/doc/bugs/tests_started_to_fail_recently.mdwn b/doc/bugs/tests_started_to_fail_recently.mdwn index 77bed9042e..b4c663f906 100644 --- a/doc/bugs/tests_started_to_fail_recently.mdwn +++ b/doc/bugs/tests_started_to_fail_recently.mdwn @@ -74,3 +74,5 @@ although there in first failing was a bit different on OSX Use -p '/git-remote-annex/' to rerun this test only. ``` + +[[!meta title="git-remote-annex clone from special remote fails on OSX"]]
git-remote-annex: Fix cloning from a special remote on a crippled filesystem
Not initializing and so deleting the bundles only causes a little more work
on the first git fetch.
diff --git a/CHANGELOG b/CHANGELOG index 254f914b59..7e523186c6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -15,6 +15,8 @@ git-annex (10.20241032) UNRELEASED; urgency=medium unversioned S3 bucket that is large enough to need pagination. * S3: Use significantly less memory when importing from a versioned S3 bucket. + * git-remote-annex: Fix cloning from a special remote on a crippled + filesystem. -- Joey Hess <id@joeyh.name> Mon, 11 Nov 2024 12:26:00 -0400 diff --git a/CmdLine/GitRemoteAnnex.hs b/CmdLine/GitRemoteAnnex.hs index 36d2446e4e..89ded7191c 100644 --- a/CmdLine/GitRemoteAnnex.hs +++ b/CmdLine/GitRemoteAnnex.hs @@ -1129,7 +1129,7 @@ specialRemoteFromUrl sab a = withTmpDir "journal" $ \tmpdir -> do -- If the git-annex branch did not exist when this command started, -- it was created empty by this command, and this command has avoided -- making any other commits to it, writing any temporary annex branch --- changes to thre alternateJournal, which can now be discarded. +-- changes to the alternateJournal, which can now be discarded. -- -- If nothing else has written to the branch while this command was running, -- the branch will be deleted. That allows for the git-annex branch that is @@ -1152,6 +1152,11 @@ specialRemoteFromUrl sab a = withTmpDir "journal" $ \tmpdir -> do -- does not contain any hooks. Since initialization installs -- hooks, have to work around that by not initializing, and -- delete the git bundle objects. +-- +-- Similarly, when on a crippled filesystem, doing initialization would +-- involve checking out an adjusted branch. But git clone wants to do its +-- own checkout. So no initialization is done then, and the git bundle +-- objects are deleted. 
cleanupInitialization :: StartAnnexBranch -> FilePath -> Annex () cleanupInitialization sab alternatejournaldir = void $ tryNonAsync $ do liftIO $ mapM_ removeFile =<< dirContents alternatejournaldir @@ -1173,7 +1178,7 @@ cleanupInitialization sab alternatejournaldir = void $ tryNonAsync $ do Nothing -> return () Just _ -> void $ tryNonAsync $ inRepo $ Git.Branch.delete Annex.Branch.fullname - ifM (Annex.Branch.hasSibling <&&> nonbuggygitversion) + ifM (Annex.Branch.hasSibling <&&> nonbuggygitversion <&&> notcrippledfilesystem) ( do autoInitialize' (pure True) startupAnnex remoteList differences <- allDifferences <$> recordedDifferences @@ -1190,6 +1195,8 @@ cleanupInitialization sab alternatejournaldir = void $ tryNonAsync $ do _ -> noop void $ liftIO $ tryIO $ removeDirectory (decodeBS annexobjectdir) + notcrippledfilesystem = not <$> probeCrippledFileSystem + nonbuggygitversion = liftIO $ flip notElem buggygitversions <$> Git.Version.installed buggygitversions = map Git.Version.normalize diff --git a/doc/bugs/tests_started_to_fail_recently/comment_1_c07a23f5d8524ba8f97187ade6eeb441._comment b/doc/bugs/tests_started_to_fail_recently/comment_1_c07a23f5d8524ba8f97187ade6eeb441._comment new file mode 100644 index 0000000000..8f4197908f --- /dev/null +++ b/doc/bugs/tests_started_to_fail_recently/comment_1_c07a23f5d8524ba8f97187ade6eeb441._comment @@ -0,0 +1,17 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2024-11-19T16:18:31Z" + content=""" +This is a new test. + +Looks like it's found a legitimate bug in git-remote-annex. When the +filesystem is crippled, the git-annex init checks out an adjusted branch, +which here happens in the middle of git's own checkout and so legitimately +confuses git. + +I can reproduce this on a FAT filesystem, cloning from eg a directory +special remote. Fixed this. + +(The OSX failure is something else.) +"""]]
initial report on failing tests
diff --git a/doc/bugs/tests_started_to_fail_recently.mdwn b/doc/bugs/tests_started_to_fail_recently.mdwn new file mode 100644 index 0000000000..77bed9042e --- /dev/null +++ b/doc/bugs/tests_started_to_fail_recently.mdwn @@ -0,0 +1,76 @@ +### Please describe the problem. + +eg from [this recent run](https://github.com/datalad/git-annex/actions/runs/11875458683/job/33092822672) + +``` +Tests + Repo Tests v10 adjusted unlocked branch + Init Tests + init: OK (0.43s) + add: OK (0.83s) + sop crypto: OK + upgrade: OK (0.52s) + conflict resolution (uncommitted local file): OK (4.99s) + adjusted branch merge regression: OK (1.09s) + describe: OK (0.62s) + fsck (local untrusted): OK (1.60s) + lock --force: OK (2.29s) + drop (untrusted remote): OK (1.69s) + view: OK (0.91s) + git-remote-annex: FAIL (3.01s) + ./Test/Framework.hs:86: + git clone from special remote failed with unexpected exit code (transcript follows) + Cloning into 'clonedir'... + Detected a filesystem without fifo support. + Disabling ssh connection caching. + Detected a crippled filesystem. + Entering an adjusted branch where files are unlocked as this filesystem does not support locked files. + Switched to branch 'adjusted/master(unlocked)' + error: Untracked working tree file 'bar.c' would be overwritten by merge. + fatal: unable to checkout working tree + warning: Clone succeeded, but checkout failed. + You can inspect what was checked out with 'git status' + and retry with 'git restore --source=HEAD :/' + + + Use -p '/git-remote-annex/' to rerun this test only. 
+ +1 out of 12 tests failed (17.99s) +``` + +overall -- seems started to fail about a week ago + +``` + 167 T Nov 17 GitHub Actions datalad/git-annex daily summary: 20 PASSED, 10 FAILED, 1 ABSENT + 238 T Nov 16 GitHub Actions datalad/git-annex daily summary: 20 PASSED, 10 FAILED, 1 ABSENT + 348 T Nov 15 GitHub Actions datalad/git-annex daily summary: 23 PASSED, 7 FAILED, 1 ABSENT + 890 T Nov 14 GitHub Actions datalad/git-annex daily summary: 23 PASSED, 7 FAILED, 1 ABSENT +1676 T Nov 13 GitHub Actions datalad/git-annex daily summary: 22 PASSED, 8 FAILED, 1 ABSENT +2032 T Nov 12 GitHub Actions datalad/git-annex daily summary: 23 PASSED, 7 FAILED, 1 ABSENT +2561 T Nov 11 GitHub Actions datalad/git-annex daily summary: 30 PASSED, 1 ABSENT +``` + +although there in first failing was a bit different on OSX + +``` + Repo Tests v10 locked + Init Tests + init: OK (0.43s) + add: OK (1.17s) + sop crypto: OK + upgrade: OK (0.62s) + conflict resolution (uncommitted local file): OK (5.93s) + adjusted branch merge regression: OK (7.74s) + describe: OK (0.92s) + fsck (local untrusted): OK (1.87s) + lock --force: OK (1.64s) + drop (untrusted remote): OK (1.38s) + view: OK (1.48s) + git-remote-annex: FAIL (2.95s) + ./Test/Framework.hs:86: + git clone from special remote failed with unexpected exit code (transcript follows) + Cloning into 'clonedir'... + git-annex: No git repository found in this remote. + + Use -p '/git-remote-annex/' to rerun this test only. +```
update
diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_8_017cf9156e94b1587f1853504d6c2de1._comment b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_8_017cf9156e94b1587f1853504d6c2de1._comment index e6b9800971..a361760605 100644 --- a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_8_017cf9156e94b1587f1853504d6c2de1._comment +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_8_017cf9156e94b1587f1853504d6c2de1._comment @@ -20,4 +20,7 @@ oddly didn't save any memory. Memory profiling might let this be improved further, but needing 1 gb of memory to import a million changes to files doesn't seem too bad. + +Update: Did some memory profiling, nothing stuck out as badly wrong. +Lists and tuples are using as much memory as anything. """]]
close
diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix.mdwn b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix.mdwn index 8f4723e67d..6a9c2349be 100644 --- a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix.mdwn +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix.mdwn @@ -70,3 +70,6 @@ local repository version: 10 [[!meta author=yoh]] [[!tag projects/dandi]] + +> Calling this [[done]] although memory use improvements still seem +> possible.. --[[Joey]]
comments
diff --git a/doc/bugs/git-annex-import_stalls_and_uses_all_ram_available/comment_4_94b241ec93018adce716ceeed4bffd44._comment b/doc/bugs/git-annex-import_stalls_and_uses_all_ram_available/comment_4_94b241ec93018adce716ceeed4bffd44._comment new file mode 100644 index 0000000000..e1cf41e901 --- /dev/null +++ b/doc/bugs/git-annex-import_stalls_and_uses_all_ram_available/comment_4_94b241ec93018adce716ceeed4bffd44._comment @@ -0,0 +1,10 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 4""" + date="2024-11-15T19:29:52Z" + content=""" +FWIW, I've made some improvements that should make it need around 80% less +memory in this case. Which might be enough to let it import. + +Still don't have filtering on preferred contents on the fly though. +"""]] diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_8_017cf9156e94b1587f1853504d6c2de1._comment b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_8_017cf9156e94b1587f1853504d6c2de1._comment new file mode 100644 index 0000000000..e6b9800971 --- /dev/null +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_8_017cf9156e94b1587f1853504d6c2de1._comment @@ -0,0 +1,23 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 8""" + date="2024-11-15T17:48:08Z" + content=""" +Did same memory optimisation for the versioned case, and the results are +striking! Running the command until it had made 45 API requests, it was +using 592788 kb of memory. Now it uses only 110968 kb. + +Of that, about 78900 kb are used at startup, so it grew 29836 kb. +At that point, it has gathered 23537 changes. So about 1 kb is used per +change. 
That seems a bit more memory than really should be needed, +each change takes about 75 bytes of data, eg: + + "y3RixvrmLvr1oWJ7meEa4vWK6B.C.aad",3340,"dandisets/000003/draft/dandiset.jsonld",2021-09-28 02:12:39 UTC + +I did try some further memory optimisation, making it avoid storing the +same filename repeatedly in memory when gathering versioned changes. Which +oddly didn't save any memory. + +Memory profiling might let this be improved further, but needing 1 gb of +memory to import a million changes to files doesn't seem too bad. +"""]]
use 20% less memory when listing unversioned S3 bucket
diff --git a/Remote/S3.hs b/Remote/S3.hs index 299f7d7644..36cbedef50 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -601,15 +601,29 @@ listImportableContentsS3 hv r info c = { S3.gbMarker = marker , S3.gbPrefix = fileprefix } - continuelistunversioned h (rsp:l) rsp' + l' <- extractFromResourceT $ + extractunversioned rsp + continuelistunversioned h (l':l) rsp' Nothing -> nomore | otherwise = nomore where nomore = return $ - mkImportableContentsUnversioned info (reverse (rsp:l)) + mkImportableContentsUnversioned + (reverse (extractunversioned rsp:l)) + extractunversioned = mapMaybe extractunversioned' . S3.gbrContents + extractunversioned' oi = do + loc <- bucketImportLocation info $ + T.unpack $ S3.objectKey oi + let sz = S3.objectSize oi + let cid = mkS3UnversionedContentIdentifier $ S3.objectETag oi + return (loc, (cid, sz)) + continuelistversioned h l rsp | S3.gbovrIsTruncated rsp = do + let showme x = case x of + S3.DeleteMarker {} -> "delete" + v -> S3.oviKey v rsp' <- sendS3Handle h $ (S3.getBucketObjectVersions (bucket info)) { S3.gbovKeyMarker = S3.gbovrNextKeyMarker rsp @@ -620,18 +634,11 @@ listImportableContentsS3 hv r info c = | otherwise = return $ mkImportableContentsVersioned info (reverse (rsp:l)) -mkImportableContentsUnversioned :: S3Info -> [S3.GetBucketResponse] -> ImportableContents (ContentIdentifier, ByteSize) -mkImportableContentsUnversioned info l = ImportableContents - { importableContents = concatMap (mapMaybe extract . 
S3.gbrContents) l +mkImportableContentsUnversioned :: [[(ImportLocation, (ContentIdentifier, ByteSize))]] -> ImportableContents (ContentIdentifier, ByteSize) +mkImportableContentsUnversioned l = ImportableContents + { importableContents = concat l , importableHistory = [] } - where - extract oi = do - loc <- bucketImportLocation info $ - T.unpack $ S3.objectKey oi - let sz = S3.objectSize oi - let cid = mkS3UnversionedContentIdentifier $ S3.objectETag oi - return (loc, (cid, sz)) mkImportableContentsVersioned :: S3Info -> [S3.GetBucketObjectVersionsResponse] -> ImportableContents (ContentIdentifier, ByteSize) mkImportableContentsVersioned info = build . groupfiles diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_7_fe6e9bc5460f9bcd24eb3034a2f45fbc._comment b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_7_fe6e9bc5460f9bcd24eb3034a2f45fbc._comment new file mode 100644 index 0000000000..abeaf7d584 --- /dev/null +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_7_fe6e9bc5460f9bcd24eb3034a2f45fbc._comment @@ -0,0 +1,16 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 7""" + date="2024-11-15T17:16:51Z" + content=""" +Trying the same command but with versioning=yes, I have verified that + +* it does not have the same loop forever behavior +* it does use a lot of memory quite quickly + +Going back to the unversioned command, I was able to reduce the memory use +by 20% by processing each result, rather than building up a list of results +and processing at the end. It will be harder to do that in the versioning +case, but I expect it will improve it at least that much, and probably +more, since it will be able to GC all the delete markers. +"""]]
diff --git a/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn b/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn index 45af5b2ee6..fc1e5bb168 100644 --- a/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn +++ b/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn @@ -15,6 +15,6 @@ However, git annex exits without copying any files, my repo is still empty after I also tried git annex findkeys --not --unused, but it says invalid option --unused :-( -In my example I have multiple repositories that all have part of the files I want, so I cannot just make a repo that has all versions of all files and then `drop --unused`. That also would take too much storage. +In real life I have multiple repositories that all have part of the files I want, so I cannot just make a repo that has all versions of all files and then `drop --unused`. That also would take too much storage. How can I do this?
diff --git a/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn b/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn index 2c4b1a3289..45af5b2ee6 100644 --- a/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn +++ b/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn @@ -15,4 +15,6 @@ However, git annex exits without copying any files, my repo is still empty after I also tried git annex findkeys --not --unused, but it says invalid option --unused :-( +In my example I have multiple repositories that all have part of the files I want, so I cannot just make a repo that has all versions of all files and then `drop --unused`. That also would take too much storage. + How can I do this?
diff --git a/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn b/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn index cdc9b6bf8b..2c4b1a3289 100644 --- a/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn +++ b/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn @@ -4,10 +4,12 @@ I tried to clone a present repository to an new folder and move there only files But git annex does nothing: +``` git clone my-repo repo-archive cd repo-archive git annex init git annex copy --to=here --not --unused +``` However, git annex exits without copying any files, my repo is still empty afterwards.
Added a comment
diff --git a/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_1_139620857a275559b06fee54a21cbf08._comment b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_1_139620857a275559b06fee54a21cbf08._comment new file mode 100644 index 0000000000..d34274f304 --- /dev/null +++ b/doc/todo/p2phttp__58___reuse_credentials_for_repos_on_one_host/comment_1_139620857a275559b06fee54a21cbf08._comment @@ -0,0 +1,10 @@ +[[!comment format=mdwn + username="matrss" + avatar="http://cdn.libravatar.org/avatar/59541f50d845e5f81aff06e88a38b9de" + subject="comment 1" + date="2024-11-15T08:54:07Z" + content=""" +Just an addendum: in forgejo-aneksajo I've effectively implemented the third option by having one git-annex-p2phttp endpoint for all repositories, peaking at the request to get the repository UUID, starting the p2phttp server for that repository, and then forwarding the request. So, having to enter the credentials for every new repository is no longer a concern there, and <https://git-annex.branchable.com/todo/p2phttp_serve_multiple_repositories/> would address this for standalone p2phttp. + +What might still be nice though is trying to reuse the credentials of standard git operations for p2phttp. In the case of forgejo-aneksajo, git push/pull and annex-p2phttp operations use the same username/password or username/access-token combination for authentication, but git-annex will prompt for them twice due to the different URLs. This might be a bit hacky, but I think this would just work if git-annex allowed plain http(s):// URLs in addition to annex+http(s):// in the annexurl configuration, as the request to git credential would then match that of plain git operations. +"""]]
diff --git a/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn b/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn new file mode 100644 index 0000000000..cdc9b6bf8b --- /dev/null +++ b/doc/forum/How_to_get_a_list_of_all_NOT_unused_files.mdwn @@ -0,0 +1,16 @@ +I have a research project where I want to save some but not all versions. Those that should be saved are tagged. I want to create a repository (and archive it) that contains only those files. It is, so to say, the inverse of --unused. + +I tried to clone a present repository to an new folder and move there only files that are referenced by some ref (branch or tag). + +But git annex does nothing: + +git clone my-repo repo-archive +cd repo-archive +git annex init +git annex copy --to=here --not --unused + +However, git annex exits without copying any files, my repo is still empty afterwards. + +I also tried git annex findkeys --not --unused, but it says invalid option --unused :-( + +How can I do this?
fixed
diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_6_7cdffb27b1ab45fab71f1de19501f243._comment b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_6_7cdffb27b1ab45fab71f1de19501f243._comment new file mode 100644 index 0000000000..ebe1e01a77 --- /dev/null +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_6_7cdffb27b1ab45fab71f1de19501f243._comment @@ -0,0 +1,11 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 6""" + date="2024-11-14T20:14:29Z" + content=""" +Fixed in [[!commit 4b87669ae229c89eadb4ff88eba927e105c003c4]]. Now it runs +in seconds. + +Note that this bug does not seem to affect S3 remotes that have versioning +enabled. +"""]]
comments
diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_2_6f19e248752d4edfc36e84bb92a7348d._comment b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_2_6f19e248752d4edfc36e84bb92a7348d._comment new file mode 100644 index 0000000000..5104695d12 --- /dev/null +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_2_6f19e248752d4edfc36e84bb92a7348d._comment @@ -0,0 +1,26 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2024-11-14T18:23:54Z" + content=""" +No, it does not request versions from S3 when versioning is not enabled. + +This feels fairly similar to +[[git-annex-import_stalls_and_uses_all_ram_available]]. +But I don't think it's really the same, that one used versioning, and relied +on preferred content to filter the wanted files. + +Is the size of the whole bucket under the fileprefix, in your case, large +enough that storing a list of all the files (without the versions) could +logically take as much memory as you're seeing? At one point you said it +was 7k files, but later hundreds of thousands, so I'm confused about how +big it is. + +Is this bucket supposed to be public? I am having difficulty finding an +initremote command that works. + +It also seems quite possible, looking at the code, that it's keeping all +the responses from S3 in memory until it gets done with listing all the +files, which would further increase memory use. +I don't see any `O(N^2)` operations though. 
+"""]] diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_3_b5b786e7ab8fa6c2fe80691033529b5b._comment b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_3_b5b786e7ab8fa6c2fe80691033529b5b._comment new file mode 100644 index 0000000000..0321f5b46d --- /dev/null +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_3_b5b786e7ab8fa6c2fe80691033529b5b._comment @@ -0,0 +1,13 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2024-11-14T18:50:37Z" + content=""" +This is the initremote for it: + + git-annex initremote dandiarchive type=S3 encryption=none fileprefix=dandisets/ bucket=dandiarchive publicurl=https://dandiarchive.s3.amazonaws.com/ signature=anonymous host=s3.amazonaws.com datacenter=US importtree=yes + +It started at 1 API call per second, but it slowed down as memory rapidly +went up. 3 gb in a few minutes, so I think there is definitely a memory +leak involved. +"""]] diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_4_c57a8a4fbceb47965da3bf32ce502ed6._comment b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_4_c57a8a4fbceb47965da3bf32ce502ed6._comment new file mode 100644 index 0000000000..2c40a7e51a --- /dev/null +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_4_c57a8a4fbceb47965da3bf32ce502ed6._comment @@ -0,0 +1,9 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 4""" + date="2024-11-14T19:05:48Z" + content=""" +I suspect one way the CLI tool is faster, aside from not leaking memory, +is that there is a max-key max-keys parameter that git-annex is not using. +Less pagination would speed it up. 
+"""]] diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_5_1784657b6d59a7b7da71fdbb8dbcf61c._comment b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_5_1784657b6d59a7b7da71fdbb8dbcf61c._comment new file mode 100644 index 0000000000..fca257a787 --- /dev/null +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_5_1784657b6d59a7b7da71fdbb8dbcf61c._comment @@ -0,0 +1,18 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 5""" + date="2024-11-14T19:21:33Z" + content=""" +Apparently gbrNextMarker is Nothing despite the response being truncted. So +git-annex is looping forever, getting the same first page each time, and +storing it all in a list. + +I think this is a bug in the aws library, or I'm using it wrong. +It looks for a NextMarker in the response XML, but accoccording to +<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjects.html> + +> This element is returned only if you have the delimiter request parameter +> specified. If the response does not include the NextMarker element and it is +> truncated, you can use the value of the last Key element in the response as the +> marker parameter in the subsequent request to get the next set of object keys. +"""]]
comment
diff --git a/doc/bugs/multiple_records_in_remote.log_for_the_same_remote/comment_2_b384a880f56dc9231233214b42e941a3._comment b/doc/bugs/multiple_records_in_remote.log_for_the_same_remote/comment_2_b384a880f56dc9231233214b42e941a3._comment new file mode 100644 index 0000000000..d0c79795b1 --- /dev/null +++ b/doc/bugs/multiple_records_in_remote.log_for_the_same_remote/comment_2_b384a880f56dc9231233214b42e941a3._comment @@ -0,0 +1,31 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2024-11-14T17:57:56Z" + content=""" +Multiple names for the same uuid is easy to explain, if they ran `git-annex +renameremote`. Anyway, git-annex will use whichever of those configs for that +uuid has the latest timestamp. So not really a problem. And when the +remote.log gets compacted (as happened when you did "that dance"), the old +log entries get removed. + +Multiple uuids for the same name is also pretty easy to explain: +initremote can be run twice with the same name in different clones, +and so you then have two remotes upon merging. `git-annex enableremote` +does deal with this situation, failing with "Multiple remotes have that +name. Either use git-annex renameremote to rename them, or specify the uuid +of the remote." + +Here you didn't use enableremote though, but it autoenabled. Since both +remotes have autoenable set, I think what happened was whichever got +autoenabled second overwrote the git config of the one that got autoenabled +first. Here's how that looks: + + git-annex init + init (Auto enabling special remote foo...) + (Auto enabling special remote foo...) + ok + +Maybe autoenable could somehow handle that case better, but all I can think +of is a warning. +"""]]
close
diff --git a/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn b/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn index 01c34bb718..d167c8143d 100644 --- a/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn +++ b/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn @@ -30,3 +30,5 @@ It seems to be easy to check if versioning enabled: [[!meta author=yoh]] [[!tag projects/dandi]] + +> [[fixed|done]] --[[Joey]]
S3: Send git-annex or other configured User-Agent.
--user-agent is the only way to configure it currently
(Needs aws-0.24.3)
--user-agent is the only way to configure it currently
(Needs aws-0.24.3)
diff --git a/CHANGELOG b/CHANGELOG index 46fc1d4470..24f1d162a9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,6 +5,8 @@ git-annex (10.20241032) UNRELEASED; urgency=medium * vpop: Only update state after successful checkout. * S3: Support versioning=yes with a readonly bucket. (Needs aws-0.24.3) + * S3: Send git-annex or other configured User-Agent. + (Needs aws-0.24.3) -- Joey Hess <id@joeyh.name> Mon, 11 Nov 2024 12:26:00 -0400 diff --git a/Remote/S3.hs b/Remote/S3.hs index 2f5dc17b1f..b7d13f11f5 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -63,8 +63,8 @@ import Utility.Metered import Utility.DataUnits import Annex.Content import qualified Annex.Url as Url -import Utility.Url (extractFromResourceT) -import Annex.Url (getUrlOptions, withUrlOptions, UrlOptions(..)) +import Utility.Url (extractFromResourceT, UserAgent) +import Annex.Url (getUserAgent, getUrlOptions, withUrlOptions, UrlOptions(..)) import Utility.Env import Annex.Verify @@ -885,10 +885,11 @@ mkS3HandleVar c gc u = liftIO $ newTVarIO $ Left $ Just creds -> go =<< liftIO (genCredentials creds) Nothing -> return (Left S3HandleNeedCreds) where - s3cfg = s3Configuration c go awscreds = do - let awscfg = AWS.Configuration AWS.Timestamp awscreds debugMapper Nothing ou <- getUrlOptions + ua <- getUserAgent + let awscfg = AWS.Configuration AWS.Timestamp awscreds debugMapper Nothing + let s3cfg = s3Configuration (Just ua) c return $ Right $ S3Handle (httpManager ou) awscfg s3cfg withS3Handle :: S3HandleVar -> (Either S3HandleProblem S3Handle -> Annex a) -> Annex a @@ -907,13 +908,20 @@ withS3HandleOrFail u hv a = withS3Handle hv $ \case needS3Creds :: UUID -> String needS3Creds u = missingCredPairFor "S3" (AWS.creds u) -s3Configuration :: ParsedRemoteConfig -> S3.S3Configuration AWS.NormalQuery -s3Configuration c = cfg +s3Configuration :: Maybe UserAgent -> ParsedRemoteConfig -> S3.S3Configuration AWS.NormalQuery +#if MIN_VERSION_aws(0,24,3) +s3Configuration ua c = cfg +#else +s3Configuration _ua c = cfg 
+#endif { S3.s3Port = port , S3.s3RequestStyle = case getRemoteConfigValue requeststyleField c of Just "path" -> S3.PathStyle Just s -> giveup $ "bad S3 requeststyle value: " ++ s Nothing -> S3.s3RequestStyle cfg +#if MIN_VERSION_aws(0,24,3) + , S3.s3UserAgent = T.pack <$> ua +#endif } where h = fromJust $ getRemoteConfigValue hostField c @@ -1157,7 +1165,7 @@ s3Info c info = catMaybes , Just ("versioning", if versioning info then "yes" else "no") ] where - s3c = s3Configuration c + s3c = s3Configuration Nothing c showstorageclass (S3.OtherStorageClass t) = T.unpack t showstorageclass sc = show sc diff --git a/doc/bugs/User-Agent_not_sent_with_S3_remote.mdwn b/doc/bugs/User-Agent_not_sent_with_S3_remote.mdwn index b270f8163c..e9500a7990 100644 --- a/doc/bugs/User-Agent_not_sent_with_S3_remote.mdwn +++ b/doc/bugs/User-Agent_not_sent_with_S3_remote.mdwn @@ -1,3 +1,5 @@ ### Please describe the problem. git-annex does not appear to send a User-Agent when used with an S3 remote. + +> [[fixed|done]] --[[Joey]] diff --git a/doc/bugs/User-Agent_not_sent_with_S3_remote/comment_2_18f895b9e38908faecfc0a55a802fdd3._comment b/doc/bugs/User-Agent_not_sent_with_S3_remote/comment_2_18f895b9e38908faecfc0a55a802fdd3._comment new file mode 100644 index 0000000000..96000d75d8 --- /dev/null +++ b/doc/bugs/User-Agent_not_sent_with_S3_remote/comment_2_18f895b9e38908faecfc0a55a802fdd3._comment @@ -0,0 +1,13 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2024-11-13T19:52:28Z" + content=""" +Drat, no followup. I seem to remember hearing about a S3 implementation +that either needed any User-Agent header (currently, git-annex does not +send one to S3), or perhaps a specific User-Agent. But I don't remember +details. + +Anyway, I have implemented a patch to the aws library that can be used for +this. 
+"""]] diff --git a/doc/git-annex-common-options.mdwn b/doc/git-annex-common-options.mdwn index 37e2e0aaf7..9e0003fb65 100644 --- a/doc/git-annex-common-options.mdwn +++ b/doc/git-annex-common-options.mdwn @@ -133,7 +133,8 @@ Most of these options are accepted by all git-annex commands. * `--user-agent=value` - Overrides the User-Agent to use when downloading files from the web. + Overrides the User-Agent to use when downloading files from the web, + or otherwise accessing web services. * `--notify-finish`
response
diff --git a/doc/forum/Clusters_-_what__39__s_the_use_case__63__/comment_1_b018aa684ad3c17d7f392acb41dc8d8c._comment b/doc/forum/Clusters_-_what__39__s_the_use_case__63__/comment_1_b018aa684ad3c17d7f392acb41dc8d8c._comment new file mode 100644 index 0000000000..e827505304 --- /dev/null +++ b/doc/forum/Clusters_-_what__39__s_the_use_case__63__/comment_1_b018aa684ad3c17d7f392acb41dc8d8c._comment @@ -0,0 +1,33 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2024-11-13T18:15:22Z" + content=""" +I think that the most important use case for a cluster is +that it lets you make a single upload of a file and let the cluster decide +which nodes to store it on. +Rather than needing to upload several times to different repositories. + +Being able to `git-annex drop foo --from cluster` and remove it from whichever +nodes the cluster happens to be storing it on is also a convenient use case. + +Another use case is letting the cluster operator provide access to some +resource that you can't directly access. (Although that is really a property +of proxies more generally.) Eg, a cluster node could be a S3 bucket that +only the cluster operator has credentials to access. + +So it's an abstraction, but it's intentionally a leaky one; you can still +access specific nodes of a cluster individually via the cluster gateway. Eg +you can use "cluster-node1" as a remote. But also, with physical access, +you could pull that node1 drive out of the cluster, make it a regular +remote, and git-annex will know what files are on it. And when you put it +back in the cluster, it will know what changes you made to it. And when you +consider that situation, I think it makes sense why the contents of each +node are tracked individually, which is why each node counts as its own copy. + +It's also possible that git-annex will eventually not be limited to +"1 repository is 1 copy" more generally. 
+[[todo/repositories_that_count_as_more_than_one_copy]] is pondering +that, and has some other cases than clusters where it may make sense, +to some people, to have a repository be treated as more than 1 copy. +"""]]
comment
diff --git a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_4_17750cd497243aa33b570875e73d06fb._comment b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_4_17750cd497243aa33b570875e73d06fb._comment new file mode 100644 index 0000000000..15a24cb02d --- /dev/null +++ b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_4_17750cd497243aa33b570875e73d06fb._comment @@ -0,0 +1,14 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 4""" + date="2024-11-13T18:02:52Z" + content=""" +I think the bug here is that git-annex doesn't make clear it's treating +that as a dotfile. + +The docs were unclear to some, as mentioned in that other bug report, +and I've clarified them. + +But also, I made `git-annex add` display "dotfile; adding content to git +repository" +"""]]
Added a comment
diff --git a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_4_0aa887dcfdbb9e0b6eaee3ff01d545bb._comment b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_4_0aa887dcfdbb9e0b6eaee3ff01d545bb._comment new file mode 100644 index 0000000000..38d4798ca0 --- /dev/null +++ b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_4_0aa887dcfdbb9e0b6eaee3ff01d545bb._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="comment 4" + date="2024-11-13T18:11:58Z" + content=""" +Thank you! FWIW confirming that doing `git annex config --set annex.dotfiles true` made subsequent `git annex add` add that file under git-annex. Filed a [FAQ documentation for con/duct](https://github.com/con/duct/pull/225) in hope to take longer this time to forget about this setting ;-) +"""]]
add: When adding a dotfile as a non-large file, mention that it's a dotfile
This is to reduce user confusion when their annex.largefiles matches it,
or is not set.
Note that, when annex.dotfiles is set, but a dotfile is not matched by
annex.largefiles, the "non-large file" message will be displayed. That
makes sense because whether the file is a dotfile does not matter with that
configuration.
Also, this slightly optimised the annex.dotfiles path in passing,
by avoiding the slight slowdown caused by the check added in commit
876d5b6c6fed7d540767783eb3d14469f41bd1c2 in that case.
This is to reduce user confusion when their annex.largefiles matches it,
or is not set.
Note that, when annex.dotfiles is set, but a dotfile is not matched by
annex.largefiles, the "non-large file" message will be displayed. That
makes sense because whether the file is a dotfile does not matter with that
configuration.
Also, this slightly optimised the annex.dotfiles path in passing,
by avoiding the slight slowdown caused by the check added in commit
876d5b6c6fed7d540767783eb3d14469f41bd1c2 in that case.
diff --git a/CHANGELOG b/CHANGELOG index 91d3ca294d..d87fa5a3d3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,6 +5,8 @@ git-annex (10.20241032) UNRELEASED; urgency=medium * vpop: Only update state after successful checkout. * add: Consistently treat files in a dotdir as dotfiles, even when ran inside that dotdir. + * add: When adding a dotfile as a non-large file, mention that it's a + dotfile. -- Joey Hess <id@joeyh.name> Mon, 11 Nov 2024 12:26:00 -0400 diff --git a/Command/Add.hs b/Command/Add.hs index ff60cd4370..ef5853126f 100644 --- a/Command/Add.hs +++ b/Command/Add.hs @@ -95,18 +95,20 @@ seek' o = do annexdotfiles <- getGitConfigVal annexDotFiles let gofile includingsmall (si, file) = case largeFilesOverride o of Nothing -> do - topfile <- getTopFilePath <$> inRepo (toTopFilePath file) - ifM (pure (annexdotfiles || not (dotfile topfile)) - <&&> (checkFileMatcher NoLiveUpdate largematcher file - <||> Annex.getRead Annex.force)) - ( start dr si file addunlockedmatcher - , if includingsmall + isdotfile <- if annexdotfiles + then pure False + else dotfile . getTopFilePath + <$> inRepo (toTopFilePath file) + islarge <- checkFileMatcher NoLiveUpdate largematcher file + <||> Annex.getRead Annex.force + if (not isdotfile && islarge) + then start dr si file addunlockedmatcher + else if includingsmall then ifM (annexAddSmallFiles <$> Annex.getGitConfig) - ( startSmall dr si file + ( startSmall isdotfile dr si file , stop ) else stop - ) Just True -> start dr si file addunlockedmatcher Just False -> startSmallOverridden dr si file case batchOption o of @@ -138,17 +140,18 @@ seek' o = do dr = dryRunOption o {- Pass file off to git-add. 
-} -startSmall :: DryRun -> SeekInput -> RawFilePath -> CommandStart -startSmall dr si file = +startSmall :: Bool -> DryRun -> SeekInput -> RawFilePath -> CommandStart +startSmall isdotfile dr si file = liftIO (catchMaybeIO $ R.getSymbolicLinkStatus file) >>= \case Just s -> starting "add" (ActionItemTreeFile file) si $ - addSmall dr file s + addSmall isdotfile dr file s Nothing -> stop -addSmall :: DryRun -> RawFilePath -> FileStatus -> CommandPerform -addSmall dr file s = do - showNote "non-large file; adding content to git repository" +addSmall :: Bool -> DryRun -> RawFilePath -> FileStatus -> CommandPerform +addSmall isdotfile dr file s = do + showNote $ (if isdotfile then "dotfile" else "non-large file") + <> "; adding content to git repository" skipWhenDryRun dr $ next $ addFile Small file s startSmallOverridden :: DryRun -> SeekInput -> RawFilePath -> CommandStart diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs index 6be663b949..7feb0b19eb 100644 --- a/Command/AddUrl.hs +++ b/Command/AddUrl.hs @@ -519,7 +519,7 @@ addWorkTree _ addunlockedmatcher u url file key mtmp = case mtmp of -- than the work tree file. 
liftIO $ moveFile file tmp go - else Command.Add.addSmall (DryRun False) file s + else Command.Add.addSmall False (DryRun False) file s >>= maybe noop void where go = do diff --git a/Command/Import.hs b/Command/Import.hs index c7496aeebf..f06543bb7e 100644 --- a/Command/Import.hs +++ b/Command/Import.hs @@ -261,7 +261,7 @@ startLocal o addunlockedmatcher largematcher mode (srcfile, destfile) = >>= maybe stop (\addedk -> next $ Command.Add.cleanup addedk True) - , Command.Add.addSmall (DryRun False) destfile s + , Command.Add.addSmall False (DryRun False) destfile s ) notoverwriting why = do warning $ "not overwriting existing " <> QuotedPath destfile <> " " <> UnquotedString why diff --git a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn index 9925906fd9..b011766ae3 100644 --- a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn +++ b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn @@ -62,4 +62,4 @@ How to remedy? dandi project -> Not a bug, [[done]] --[[Joey]] +> [[fixed|done]] --[[Joey]] diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index e0cae8c02b..178ad146d3 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1009,8 +1009,8 @@ repository, using [[git-annex-config]]. See its man page for a list.) * `annex.addsmallfiles` - Controls whether small files (not matching annex.largefiles) - should be checked into git by `git annex add`. Defaults to true; + Controls whether small files (files not matching annex.largefiles, or + dotfiles) should be checked into git by `git annex add`. Defaults to true; set to false to instead make small files be skipped. * `annex.addunlocked`
Added a comment
diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_1_cd7b1bc28a000b262e11c2b89aeed0da._comment b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_1_cd7b1bc28a000b262e11c2b89aeed0da._comment new file mode 100644 index 0000000000..3c58d06076 --- /dev/null +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix/comment_1_cd7b1bc28a000b262e11c2b89aeed0da._comment @@ -0,0 +1,29 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="comment 1" + date="2024-11-13T18:08:59Z" + content=""" +At the end (after over a day of torturing that poor bucket, whenever it took just few minutes for `s3cmd sync` to get everything including content) it crashed with + +``` +[2024-11-12 22:58:00.366878941] (Remote.S3) Response status: Status {statusCode = 200, statusMessage = \"OK\"} +[2024-11-12 22:58:00.373456754] (Remote.S3) Response header 'x-amz-id-2': 'DGXJztoRJRuHQrcOqs3FtnEUJomRz+53jawFoKoRbKQATcvAppqJcfcAVfR1d8cu7uepkEDvSXo=' +[2024-11-12 22:58:00.384304583] (Remote.S3) Response header 'x-amz-request-id': 'W1PSPV7ZSBKJ7HTT' +[2024-11-12 22:58:00.38437407] (Remote.S3) Response header 'Date': 'Wed, 13 Nov 2024 03:50:18 GMT' +[2024-11-12 22:58:00.384436037] (Remote.S3) Response header 'x-amz-bucket-region': 'us-east-2' +[2024-11-12 22:58:00.384486611] (Remote.S3) Response header 'Content-Type': 'application/xml' +[2024-11-12 22:58:00.384533794] (Remote.S3) Response header 'Transfer-Encoding': 'chunked' +[2024-11-12 22:58:00.384581117] (Remote.S3) Response header 'Server': 'AmazonS3' + +git-annex: Unable to list contents of s3-dandiarchive: Network.Socket.recvBuf: resource vanished (Connection reset by peer) +failed +[2024-11-12 22:58:00.565431711] (Utility.Process) process [3912839] done ExitSuccess +import: 1 failed + +``` + +attesting that it is doing something unnecessary -- either listing full bucket (unlikely) or listing all versions of keys 
under the prefix (e.g. using [ListObjectVersions](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html) instead of [ListObjectsV2](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html)). + +It would have been useful if logs included the API call involved here. +"""]]
Added a comment
diff --git a/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/comment_5_832824ac8722238ed00fc0604ff394e0._comment b/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/comment_5_832824ac8722238ed00fc0604ff394e0._comment new file mode 100644 index 0000000000..7fb234fd5a --- /dev/null +++ b/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/comment_5_832824ac8722238ed00fc0604ff394e0._comment @@ -0,0 +1,13 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="comment 5" + date="2024-11-13T17:57:58Z" + content=""" +@lell, would you be so kind to point to me to \"formalization\" you use for your containers/configuration layout? +\"I like containers too\", and in neuro* domain we are formalizing layout of data etc on the filesystem in [BIDS standard](https://bids.neuroimaging.io/). +Moreover we are trying to formalize at the level of the \"entire project\", see e.g. [this issue with examples](https://github.com/bids-standard/bids-specification/pull/1861#issuecomment-2183701293). +I would be interested to learn what/how you do it. +Feel welcome to reach out directly to e.g. `debian AT oneukrainian.com`. +Cheers, +"""]]
add: Consistently treat files in a dotdir as dotfiles, even when run inside that dotdir
Assistant and smudge also updated.
This does add a small amount of extra work, getting the TopFilePath.
Not enough to be concerned by.
Also improve documentation to make clear that files inside dotdirs are
treated as dotfiles.
Sponsored-by: Eve on Patreon
Assistant and smudge also updated.
This does add a small amount of extra work, getting the TopFilePath.
Not enough to be concerned by.
Also improve documentation to make clear that files inside dotdirs are
treated as dotfiles.
Sponsored-by: Eve on Patreon
diff --git a/Assistant/Threads/Committer.hs b/Assistant/Threads/Committer.hs index 2f7e03c43c..85692767e7 100644 --- a/Assistant/Threads/Committer.hs +++ b/Assistant/Threads/Committer.hs @@ -45,6 +45,7 @@ import qualified Git.Branch import Utility.Tuple import Utility.Metered import qualified Utility.RawFilePath as R +import Git.FilePath import Data.Time.Clock import qualified Data.Set as S @@ -319,15 +320,19 @@ handleAdds lockdowndir havelsof largefilematcher annexdotfiles delayadd cs = ret (LinkChange (Just key)) checksmall change - | not annexdotfiles && dotfile f = - return (Right change) - | otherwise = - ifM (liftAnnex $ checkFileMatcher NoLiveUpdate largefilematcher f) - ( return (Left change) - , return (Right change) - ) + | not annexdotfiles = do + topfile <- liftAnnex $ + getTopFilePath <$> inRepo (toTopFilePath f) + if dotfile topfile + then return (Right change) + else checkmatcher + | otherwise = checkmatcher where f = toRawFilePath (changeFile change) + checkmatcher = ifM (liftAnnex $ checkFileMatcher NoLiveUpdate largefilematcher f) + ( return (Left change) + , return (Right change) + ) addsmall [] = noop addsmall toadd = liftAnnex $ void $ tryIO $ diff --git a/CHANGELOG b/CHANGELOG index 1fda614aa9..91d3ca294d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,8 @@ git-annex (10.20241032) UNRELEASED; urgency=medium * git-remote-annex: Fix a reversion introduced in version 10.20241031 that broke cloning from a special remote. * vpop: Only update state after successful checkout. + * add: Consistently treat files in a dotdir as dotfiles, even + when ran inside that dotdir. 
-- Joey Hess <id@joeyh.name> Mon, 11 Nov 2024 12:26:00 -0400 diff --git a/Command/Add.hs b/Command/Add.hs index f42008f18b..ff60cd4370 100644 --- a/Command/Add.hs +++ b/Command/Add.hs @@ -94,17 +94,19 @@ seek' o = do addunlockedmatcher <- addUnlockedMatcher annexdotfiles <- getGitConfigVal annexDotFiles let gofile includingsmall (si, file) = case largeFilesOverride o of - Nothing -> ifM (pure (annexdotfiles || not (dotfile file)) - <&&> (checkFileMatcher NoLiveUpdate largematcher file - <||> Annex.getRead Annex.force)) - ( start dr si file addunlockedmatcher - , if includingsmall - then ifM (annexAddSmallFiles <$> Annex.getGitConfig) - ( startSmall dr si file - , stop - ) - else stop - ) + Nothing -> do + topfile <- getTopFilePath <$> inRepo (toTopFilePath file) + ifM (pure (annexdotfiles || not (dotfile topfile)) + <&&> (checkFileMatcher NoLiveUpdate largematcher file + <||> Annex.getRead Annex.force)) + ( start dr si file addunlockedmatcher + , if includingsmall + then ifM (annexAddSmallFiles <$> Annex.getGitConfig) + ( startSmall dr si file + , stop + ) + else stop + ) Just True -> start dr si file addunlockedmatcher Just False -> startSmallOverridden dr si file case batchOption o of diff --git a/Command/Smudge.hs b/Command/Smudge.hs index 57ab8ff8dd..89f637dd52 100644 --- a/Command/Smudge.hs +++ b/Command/Smudge.hs @@ -239,12 +239,14 @@ shouldAnnex file indexmeta moldkey = do , checkunchanged checkwasannexed ) where - checkmatcher d - | dotfile file = ifM (getGitConfigVal annexDotFiles) - ( go - , d - ) - | otherwise = go + checkmatcher d = do + topfile <- getTopFilePath <$> inRepo (toTopFilePath file) + if dotfile topfile + then ifM (getGitConfigVal annexDotFiles) + ( go + , d + ) + else go where go = do matcher <- largeFilesMatcher diff --git a/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles.mdwn b/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles.mdwn index 353fbdd1dd..ff54c26e7f 100644 --- 
a/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles.mdwn +++ b/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles.mdwn @@ -101,3 +101,7 @@ go-to solution for “want something versioned, but can't store the contents themselves (too big, too sensitive, etc.)?”. Furthermore, git-annex documentation in general is excellent. But that is also why I'm stumped that the manual is so silent on this point. + +> [[fixed|done]] by resolving inconsistent behavior. Also improved +> documentation to be clear that dot directories are treated same as +> dotfiles. diff --git a/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/comment_5_c9206d1a0c74149df970d44025160d89._comment b/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/comment_5_c9206d1a0c74149df970d44025160d89._comment new file mode 100644 index 0000000000..b52a2b54e1 --- /dev/null +++ b/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/comment_5_c9206d1a0c74149df970d44025160d89._comment @@ -0,0 +1,21 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 5""" + date="2024-11-13T17:04:59Z" + content=""" +> Why can't git-annex just handle the .git folder differently and for all others just annex or not as set in the largefile rules? + +Because creating a .gitignore followed by `git-annex add` would then blow the +user's foot off. And this would be a very common foot-shooting opportunity, +and .gitignore is only the perhaps most common trigger for it. + +Files in dot directories are generally less common, outside of course of +.git and $HOME. Which is the only reason I'm willing to consider changing +the dotfiles handling to not include those. + +But, .config/ seems to me to perfectly match what dotfiles *are*, which is +files that are configuration that are named with a name starting with a +dot in order to keep them from cluttering up `ls`. 
Just because in your use +case you don't want to check those into git as dotfiles does not seem like +a good argument for git-annex to not treat them as dotfiles by default. +"""]] diff --git a/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/comment_6_eb82a22ffe512bf0d6f2e7841ce022f0._comment b/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/comment_6_eb82a22ffe512bf0d6f2e7841ce022f0._comment new file mode 100644 index 0000000000..985237c16d --- /dev/null +++ b/doc/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/comment_6_eb82a22ffe512bf0d6f2e7841ce022f0._comment @@ -0,0 +1,9 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 6""" + date="2024-11-13T17:14:47Z" + content=""" +Revisiting this, it seems best to fix the inconsistent behavior by +having git-annex get the path to the file relative to the top of the git +repository, and check if there's a dot directory in the path. +"""]] diff --git a/doc/git-annex-add.mdwn b/doc/git-annex-add.mdwn index a4b6d95208..6313008e06 100644 --- a/doc/git-annex-add.mdwn +++ b/doc/git-annex-add.mdwn @@ -18,10 +18,10 @@ git has been configured to ignore will be silently skipped. If annex.largefiles is configured (in git config, gitattributes, or git-annex config), and does not match a file, `git annex add` will behave the same as `git add` and add the non-large file directly to the git -repository, instead of to the annex. (By default dotfiles are assumed to -not be large, and are added directly to git, but annex.dotfiles can be -configured to annex those too.) See the git-annex manpage for documentation -of these and other configuration settings. +repository, instead of to the annex. (By default dotfiles and the contents +of dotdirs) are assumed to not be large, and are added directly to git, but +annex.dotfiles can be configured to annex those too.) See the git-annex +manpage for documentation of these and other configuration settings. 
By default, large files are added to the annex in locked form, which prevents further modification of their content until diff --git a/doc/git-annex-config.mdwn b/doc/git-annex-config.mdwn index f52f4a2a4a..a4a1b4ddac 100644 --- a/doc/git-annex-config.mdwn +++ b/doc/git-annex-config.mdwn @@ -81,8 +81,8 @@ looks for these. This configures the behavior of both git-annex and git when adding files to the repository. By default, `git-annex add` adds all files - to the annex (except dotfiles), and `git add` adds files to git - (unless they were added to the annex previously). + to the annex (except dotfiles and files in dotdirs), and + `git add` adds files to git (unless they were added to the annex previously). When annex.largefiles is configured, both `git annex add` and `git add` will add matching large files to the annex, and the other files to git. (Diff truncated)
fix link
diff --git a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_3_14cd923a1044a09034d13e3656c9af4b._comment b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_3_14cd923a1044a09034d13e3656c9af4b._comment index 644b4b56ea..6003db3c48 100644 --- a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_3_14cd923a1044a09034d13e3656c9af4b._comment +++ b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_3_14cd923a1044a09034d13e3656c9af4b._comment @@ -4,6 +4,6 @@ date="2024-11-13T16:30:55Z" content=""" Oh, this is a dotfile. It is behaving as documented for dotfiles. -(Modulo the issue discussed in [[add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles]]) +(Modulo the issue discussed in <https://git-annex.branchable.com/bugs/add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles/>) You will need to set annex.dotfiles. """]]
dotfile
diff --git a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn index 4b4a092ffe..9925906fd9 100644 --- a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn +++ b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn @@ -61,3 +61,5 @@ add .duct/logs/2024.10.30T14.59.27-418623_stdout (non-large file; adding content How to remedy? dandi project + +> Not a bug, [[done]] --[[Joey]] diff --git a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_3_14cd923a1044a09034d13e3656c9af4b._comment b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_3_14cd923a1044a09034d13e3656c9af4b._comment new file mode 100644 index 0000000000..644b4b56ea --- /dev/null +++ b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_3_14cd923a1044a09034d13e3656c9af4b._comment @@ -0,0 +1,9 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2024-11-13T16:30:55Z" + content=""" +Oh, this is a dotfile. It is behaving as documented for dotfiles. +(Modulo the issue discussed in [[add__58___inconsistently_treats_files_in_dotdirs_as_dotfiles]]) +You will need to set annex.dotfiles. +"""]]
Added a comment
diff --git a/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_1_3018433b7d2fb02f68a3acb21583904e._comment b/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_1_3018433b7d2fb02f68a3acb21583904e._comment new file mode 100644 index 0000000000..97acc76130 --- /dev/null +++ b/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_1_3018433b7d2fb02f68a3acb21583904e._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="datamanager" + avatar="http://cdn.libravatar.org/avatar/7d4ca7c5e571d4740ef072b83a746c12" + subject="comment 1" + date="2024-11-13T00:30:24Z" + content=""" + I have this problem as well, on my oneplus 12. Have you found a solution? +"""]]
removed
diff --git a/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_2_26c4fb69c9f04d93657f31a19b8a4938._comment b/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_2_26c4fb69c9f04d93657f31a19b8a4938._comment deleted file mode 100644 index 039e37d7c5..0000000000 --- a/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_2_26c4fb69c9f04d93657f31a19b8a4938._comment +++ /dev/null @@ -1,8 +0,0 @@ -[[!comment format=mdwn - username="datamanager" - avatar="http://cdn.libravatar.org/avatar/7d4ca7c5e571d4740ef072b83a746c12" - subject="comment 2" - date="2024-11-13T00:29:14Z" - content=""" -I have this problem as well, on my oneplus 12, using fdroid. Have you found a solution? -"""]]
removed
diff --git a/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_1_03844677ad292a9e303ed1083dd19a17._comment b/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_1_03844677ad292a9e303ed1083dd19a17._comment deleted file mode 100644 index 3ca931c457..0000000000 --- a/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_1_03844677ad292a9e303ed1083dd19a17._comment +++ /dev/null @@ -1,8 +0,0 @@ -[[!comment format=mdwn - username="datamanager" - avatar="http://cdn.libravatar.org/avatar/7d4ca7c5e571d4740ef072b83a746c12" - subject="comment 1" - date="2024-11-13T00:29:04Z" - content=""" -I have this problem as well, on my oneplus one, using fdroid. Have you found a solution? -"""]]
Added a comment
diff --git a/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_2_26c4fb69c9f04d93657f31a19b8a4938._comment b/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_2_26c4fb69c9f04d93657f31a19b8a4938._comment new file mode 100644 index 0000000000..039e37d7c5 --- /dev/null +++ b/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_2_26c4fb69c9f04d93657f31a19b8a4938._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="datamanager" + avatar="http://cdn.libravatar.org/avatar/7d4ca7c5e571d4740ef072b83a746c12" + subject="comment 2" + date="2024-11-13T00:29:14Z" + content=""" +I have this problem as well, on my oneplus 12, using fdroid. Have you found a solution? +"""]]
Added a comment
diff --git a/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_1_03844677ad292a9e303ed1083dd19a17._comment b/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_1_03844677ad292a9e303ed1083dd19a17._comment new file mode 100644 index 0000000000..3ca931c457 --- /dev/null +++ b/doc/bugs/install_on_android_boox__58___xargs_Permission_denied/comment_1_03844677ad292a9e303ed1083dd19a17._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="datamanager" + avatar="http://cdn.libravatar.org/avatar/7d4ca7c5e571d4740ef072b83a746c12" + subject="comment 1" + date="2024-11-13T00:29:04Z" + content=""" +I have this problem as well, on my oneplus one, using fdroid. Have you found a solution? +"""]]
diff --git a/doc/forum/Clusters_-_what__39__s_the_use_case__63__.mdwn b/doc/forum/Clusters_-_what__39__s_the_use_case__63__.mdwn new file mode 100644 index 0000000000..dbdf2ba2ff --- /dev/null +++ b/doc/forum/Clusters_-_what__39__s_the_use_case__63__.mdwn @@ -0,0 +1,11 @@ +Are there any discussions of the use case for clusters other than the [tips link](https://git-annex.branchable.com/tips/clusters/)? + +I thought I understood the use case: + +> a collection of git-annex repositories which are combined to form a single logical repository + +So, one (or more) front-end and N-backends all behaving as a single repository. + +But then I saw multiple copies on a cluster count as multiple copies. This breaks my understanding of "one copy in a repository" logic. + +Are there any other discussions of when to use/create a cluster?
initial report on annex import slowing to a crawl on dandiarchive/dandisets/
diff --git a/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix.mdwn b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix.mdwn new file mode 100644 index 0000000000..8f4723e67d --- /dev/null +++ b/doc/bugs/importtree_from_S3_slows_to_halt_even_with_prefix.mdwn @@ -0,0 +1,72 @@ +### Please describe the problem. + +I have been running + +`git annex --debug import --from s3-dandiarchive master` + +from an S3 bucket which is versioned but I did not enable versioning for this "import" case (due to [git-annex unable to sense versioning read-only](https://git-annex.branchable.com/bugs/importtree_with_versioning__61__yes__58___check_first/)) and expected it to "quickly" import tree (with about 7k files) from S3. Note that some of the keys have **many** older revisions for one reason or another. + +But currently that process, started hours ago yesterday IIRC, is + +``` + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND +3912831 dandi 20 0 1024.1g 51.7g 16000 S 100.0 82.4 19,48 git-annex +``` + +CPU heavy and very slow (now, started faster flipping through pages) on actually "importing" while listing a page every 30 seconds or so + +``` +[2024-11-12 14:59:23.587433059] (Remote.S3) Header: [("Date","Tue, 12 Nov 2024 19:59:23 GMT")] + +[2024-11-12 14:59:58.073945529] (Remote.S3) Response status: Status {statusCode = 200, statusMessage = "OK"} +[2024-11-12 14:59:58.074057102] (Remote.S3) Response header 'x-amz-id-2': 'sxDUdIkuRLs3jjjTyIbFaI+cQqLCGpTXZNFcvykT2+F6OcqVRM2IMn6P1YquVrdH3fXmV9nRnTDs9EtOtctV05GptcIaBaF2' +[2024-11-12 14:59:58.07410232] (Remote.S3) Response header 'x-amz-request-id': 'Y35X1Z41GMF9PHY8' +[2024-11-12 14:59:58.074135941] (Remote.S3) Response header 'Date': 'Tue, 12 Nov 2024 19:59:24 GMT' +[2024-11-12 14:59:58.074167094] (Remote.S3) Response header 'x-amz-bucket-region': 'us-east-2' +[2024-11-12 14:59:58.074197609] (Remote.S3) Response header 'Content-Type': 'application/xml' +[2024-11-12 14:59:58.074228873] (Remote.S3) Response header 
'Transfer-Encoding': 'chunked' +[2024-11-12 14:59:58.074259342] (Remote.S3) Response header 'Server': 'AmazonS3' +[2024-11-12 14:59:58.171273277] (Remote.S3) String to sign: "GET\n\n\nTue, 12 Nov 2024 19:59:58 GMT\n/dandiarchive/" +[2024-11-12 14:59:58.171355688] (Remote.S3) Host: "dandiarchive.s3.amazonaws.com" +[2024-11-12 14:59:58.17139206] (Remote.S3) Path: "/" +[2024-11-12 14:59:58.17142278] (Remote.S3) Query string: "prefix=dandisets%2F" +[2024-11-12 14:59:58.171463294] (Remote.S3) Header: [("Date","Tue, 12 Nov 2024 19:59:58 GMT")] + +``` + +and not sure how many pages it got so far. + +I suspect (can't tell from above) that it is using API to list all versions of keys, not just current version, even though I have not asked for versioned support. + +Note: bucket is too heavy (about 300 million keys IIRC) to list all of it for all the versions. I do not have information ready on how many versions of keys in the `dandisets/` prefix - could be some hundreds of thousands, but I would still expect/hope it to complete by now. Nothing seems to be done on filesystem or to git store yet (du says it is 280k total size) -- git-annex is just being fed information from S3. + +### What steps will reproduce the problem? + +- add s3 importtree special remote matching + +``` +bucket=dandiarchive datacenter=US encryption=none fileprefix=dandisets/ host=s3.amazonaws.com importtree=yes name=s3-dandiarchive port=80 publicurl=https://dandiarchive.s3.amazonaws.com/ signature=anonymous storageclass=STANDARD type=S3 timestamp=1731015643s +``` + +- run `annex import` from it + + +### What version of git-annex are you using? On what operating system? + +invocation of `static-git-annex-10.20241031` (build by kyleam https://git.kyleam.com/static-annex/ ... 
but I think I tried a different one before): + +```shell +(dandisets-2) dandi@drogon:/mnt/backup/dandi/dandiset-manifests$ /home/dandi/git-annexes/static-git-annex-10.20241031/bin/git-annex version +git-annex version: 10.20241031 +build flags: Pairing DBus DesktopNotify TorrentParser MagicMime Servant Benchmark Feeds Testsuite S3 WebDAV +dependency versions: aws-0.24.2 bloomfilter-2.0.1.2 crypton-1.0.1 DAV-1.3.4 feed-1.3.2.1 ghc-9.8.3 http-client-0.7.17 persistent-sqlite-2.13.3.0 torrent-10000.1.3 uuid-1.3.16 +key/value backends: SHA256E SHA256 SHA512E SHA512 SHA224E SHA224 SHA384E SHA384 SHA3_256E SHA3_256 SHA3_512E SHA3_512 SHA3_224E SHA3_224 SHA3_384E SHA3_384 SKEIN256E SKEIN256 SKEIN512E SKEIN512 BLAKE2B256E BLAKE2B256 BLAKE2B512E BLAKE2B512 BLAKE2B160E BLAKE2B160 BLAKE2B224E BLAKE2B224 BLAKE2B384E BLAKE2B384 BLAKE2BP512E BLAKE2BP512 BLAKE2S256E BLAKE2S256 BLAKE2S160E BLAKE2S160 BLAKE2S224E BLAKE2S224 BLAKE2SP256E BLAKE2SP256 BLAKE2SP224E BLAKE2SP224 SHA1E SHA1 MD5E MD5 WORM URL GITBUNDLE GITMANIFEST VURL X* +remote types: git gcrypt p2p S3 bup directory rsync web bittorrent webdav adb tahoe glacier ddar git-lfs httpalso borg rclone hook external +operating system: linux x86_64 +supported repository versions: 8 9 10 +upgrade supported from repository versions: 0 1 2 3 4 5 6 7 8 9 10 +local repository version: 10 +``` + +[[!meta author=yoh]] +[[!tag projects/dandi]]
implemented
diff --git a/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_4_e1f144bf0391a3dff27fa3bb489a2891._comment b/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_4_e1f144bf0391a3dff27fa3bb489a2891._comment new file mode 100644 index 0000000000..486be948bd --- /dev/null +++ b/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_4_e1f144bf0391a3dff27fa3bb489a2891._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 4""" + date="2024-11-12T18:32:53Z" + content=""" +The `checkbucketversioning` branch has this implemented, to be merged once +aws is released supporting it. +"""]]
comments
diff --git a/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_2_482b88a01eb307794c1dd3f9ce8d7938._comment b/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_2_482b88a01eb307794c1dd3f9ce8d7938._comment new file mode 100644 index 0000000000..582cf1b6b3 --- /dev/null +++ b/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_2_482b88a01eb307794c1dd3f9ce8d7938._comment @@ -0,0 +1,10 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2024-11-12T17:35:16Z" + content=""" +Made a pull request to aws <https://github.com/aristidb/aws/pull/292> + +(As sometimes S3 maintainer of aws, I'll probably accept it if nobody +objects to it.) +"""]] diff --git a/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_3_76df8f40ff2fa1f5429e31ecedba6090._comment b/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_3_76df8f40ff2fa1f5429e31ecedba6090._comment new file mode 100644 index 0000000000..50496de282 --- /dev/null +++ b/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_3_76df8f40ff2fa1f5429e31ecedba6090._comment @@ -0,0 +1,23 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2024-11-12T17:35:50Z" + content=""" +Wait though... We have signature=anonymous. So git-annex does in fact know +that this special remote is read-only. git-annex will never try to write to +it (even if the bucket somehow allowed anonymous writes) as long as it's +configured with signature=anonymous. + +So, it could just avoid trying to set versioning when signature=anonymous, +and assume the bucket has versioning enabled. + +Hmm, in lockContentS3, when versioning is enabled, it calls +checkVersioning, which checks if a S3 version ID has been recorded for the +file. What if the bucket did not actually have versioning enabled? Then an +import from it would not record a S3 version ID. 
That would make this, and +other places like checkKey that expect versioned buckets to have S3 version +IDs fail in unexpected ways. + +So, I guess I'm inclined to not go down this read-only path, and instead wait for +aws to get updated and use that. +"""]]
diff --git a/doc/bugs/autocompletion_of_path_not_showing_anything.mdwn b/doc/bugs/autocompletion_of_path_not_showing_anything.mdwn new file mode 100644 index 0000000000..cd27f3c3e2 --- /dev/null +++ b/doc/bugs/autocompletion_of_path_not_showing_anything.mdwn @@ -0,0 +1,47 @@ +### Please describe the problem. + +When using `git annex sync -C` / `git annex sync --content-of`, there is no suggestion of a path that is required by this argument. + + +### What steps will reproduce the problem? + +Type in `git annex sync -C` and hit <TAB> + + +### What version of git-annex are you using? On what operating system? + +git-annex: + +```shell +git-annex version: 10.20240129 +build flags: Assistant Webapp Pairing Inotify DBus DesktopNotify TorrentParser MagicMime Benchmark Feeds Testsuite S3 WebDAV +dependency versions: aws-0.24.1 bloomfilter-2.0.1.2 crypton-0.33 DAV-1.3.4 feed-1.3.2.1 ghc-9.4.7 http-client-0.7.14 persistent-sqlite-2.13.2.0 torrent-10000.1.3 uuid-1.3.15 yesod-1.6.2.1 +key/value backends: SHA256E SHA256 SHA512E SHA512 SHA224E SHA224 SHA384E SHA384 SHA3_256E SHA3_256 SHA3_512E SHA3_512 SHA3_224E SHA3_224 SHA3_384E SHA3_384 SKEIN256E SKEIN256 SKEIN512E SKEIN512 BLAKE2B256E BLAKE2B256 BLAKE2B512E BLAKE2B512 BLAKE2B160E BLAKE2B160 BLAKE2B224E BLAKE2B224 BLAKE2B384E BLAKE2B384 BLAKE2BP512E BLAKE2BP512 BLAKE2S256E BLAKE2S256 BLAKE2S160E BLAKE2S160 BLAKE2S224E BLAKE2S224 BLAKE2SP256E BLAKE2SP256 BLAKE2SP224E BLAKE2SP224 SHA1E SHA1 MD5E MD5 WORM URL X* +remote types: git gcrypt p2p S3 bup directory rsync web bittorrent webdav adb tahoe glacier ddar git-lfs httpalso borg hook external +operating system: linux x86_64 +supported repository versions: 8 9 10 +upgrade supported from repository versions: 0 1 2 3 4 5 6 7 8 9 10 +local repository version: 10 +``` + +OS: + +```shell +Distributor ID: Ubuntu +Description: Ubuntu 24.04.1 LTS +Release: 24.04 +Codename: noble +``` + +Shell: + +```shell +zsh 5.9 (x86_64-ubuntu-linux-gnu) +``` + +(no plugins, no oh-my-zsh or else, just 
plain old zsh) + + +### Have you had any luck using git-annex before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders) + +Everything else is working fine, so far :-)
Added a comment
diff --git a/doc/git-annex-reregisterurl/comment_3_8ca7c754f9f69a96c68cbf8fc6a9e6ef._comment b/doc/git-annex-reregisterurl/comment_3_8ca7c754f9f69a96c68cbf8fc6a9e6ef._comment new file mode 100644 index 0000000000..9a597ddae5 --- /dev/null +++ b/doc/git-annex-reregisterurl/comment_3_8ca7c754f9f69a96c68cbf8fc6a9e6ef._comment @@ -0,0 +1,14 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="comment 3" + date="2024-11-11T22:18:37Z" + content=""" +Thank you Joey! + +> Except for the web special remote, which is a special case. I guess yours special remote also may be a special case, ... + +or in the other words, \"not that special\" after all. I see it just a common enough pattern for use of this new `reregisterurl` that in my case it would be easier to just add an option instead of chaining manually `reregisterurl`, figure out what key one way or another, call `setpresentkey` with `0`. After all it is for a user to announce one way or another. + +Anyways, I am ok for now, hopefully would not forget next time I use it to chase with that extra invocation. +"""]]
Added a comment
diff --git a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_2_c652de8ebcc4cf96ba5e76b68cc2e083._comment b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_2_c652de8ebcc4cf96ba5e76b68cc2e083._comment new file mode 100644 index 0000000000..e9f5185021 --- /dev/null +++ b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_2_c652de8ebcc4cf96ba5e76b68cc2e083._comment @@ -0,0 +1,70 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="comment 2" + date="2024-11-11T22:11:13Z" + content=""" +nope -- I do not see any traces of such configuration anywhere + +``` +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git status +HEAD detached from 9f3a363 +Untracked files: + (use \"git add <file>...\" to include in what will be committed) + .duct/logs/2024.10.30T14.59.27-418623_info.json + .duct/logs/2024.10.30T14.59.27-418623_stderr + .duct/logs/2024.10.30T14.59.27-418623_stdout + .duct/logs/2024.10.30T14.59.27-418623_usage.json + .duct/logs/2024.11.04T12.31.25-2144989_info.json + .duct/logs/2024.11.04T12.31.25-2144989_stderr + .duct/logs/2024.11.04T12.31.25-2144989_stdout + .duct/logs/2024.11.04T12.31.25-2144989_usage.json + +nothing added to commit but untracked files present (use \"git add\" to track) +yoh@typhon:~/proj/dandi/s5cmd-dandi$ ls -ld .duct/logs/2024.10.30T14.59.27-418623_stdout +-rw-r--r-- 1 yoh yoh 54902635452 Nov 2 05:43 .duct/logs/2024.10.30T14.59.27-418623_stdout +yoh@typhon:~/proj/dandi/s5cmd-dandi$ cat .gitattributes +* annex.backend=MD5E +**/.git* annex.largefiles=nothing +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git annex config --get annex.largefiles +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git config --get annex.largefiles +yoh@typhon:~/proj/dandi/s5cmd-dandi$ cat .git/config +[core] + repositoryformatversion = 0 + filemode = true + bare = false + logallrefupdates = true +[remote \"origin\"] + url = https://github.com/dandi/s5cmd-dandi + 
fetch = +refs/heads/*:refs/remotes/origin/* + annex-ignore = true +[branch \"master\"] + remote = origin + merge = refs/heads/master +[annex] + uuid = d80bb1cb-d07b-45b7-bdb1-b4a3f0bf4131 + version = 10 +[filter \"annex\"] + smudge = git-annex smudge -- %f + clean = git-annex smudge --clean -- %f + process = git-annex filter-process +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git check-attr annex.largefiles .duct/logs/2024.11.04T12.31.25-2144989_stdout +.duct/logs/2024.11.04T12.31.25-2144989_stdout: annex.largefiles: unspecified +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git annex add .duct/logs/2024.11.04T12.31.25-2144989_stdout +add .duct/logs/2024.11.04T12.31.25-2144989_stdout (non-large file; adding content to git repository) ok +(recording state in git...) +fatal: Unable to create '/home/yoh/proj/dandi/s5cmd-dandi/.git/index.lock': File exists. + +Another git process seems to be running in this repository, e.g. +an editor opened by 'git commit'. Please make sure all processes +are terminated then try again. If it still fails, a git process +may have crashed in this repository earlier: +remove the file manually to continue. +yoh@typhon:~/proj/dandi/s5cmd-dandi$ fuser -v /home/yoh/proj/dandi/s5cmd-dandi/.git/index.lock +yoh@typhon:~/proj/dandi/s5cmd-dandi$ ls -ld /home/yoh/proj/dandi/s5cmd-dandi/.git/index.lock +-rw-r--r-- 1 yoh yoh 0 Nov 7 10:47 /home/yoh/proj/dandi/s5cmd-dandi/.git/index.lock +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git annex version --raw +10.20241031-1~ndall+1 +``` + +"""]]
comment
diff --git a/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_1_585dbd71cf35b8185c34d2847105e1de._comment b/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_1_585dbd71cf35b8185c34d2847105e1de._comment new file mode 100644 index 0000000000..b5717babf7 --- /dev/null +++ b/doc/bugs/importtree_with_versioning__61__yes__58___check_first/comment_1_585dbd71cf35b8185c34d2847105e1de._comment @@ -0,0 +1,20 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2024-11-11T20:11:37Z" + content=""" +Unfortunately <https://hackage.haskell.org/package/aws> does not implement +the versioning check, so it will need to be added there. And it tends to take +some time for new versions of the build dependency to reach everywhere. + +<https://github.com/aristidb/aws/issues/290> + +I do think that is the only safe way to go though. I considered making +git-annex assume that a bucket where versioning cannot be set is read-only. +If git-annex is really never going to write to a bucket, it's safe to +assume versioning is enabled. But, unfortunately, ACLs can sometimes +prevent changing configs like versioning, but still allow other write +operations. Also, a S3 remote might be initialized without permission to +write to an existing bucket, but later S3 creds be used that do allow +writing. +"""]]
response
diff --git a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_1_c853700b67cc30ac3f6a9796e182b0c9._comment b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_1_c853700b67cc30ac3f6a9796e182b0c9._comment new file mode 100644 index 0000000000..a0a1c26bbc --- /dev/null +++ b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__/comment_1_c853700b67cc30ac3f6a9796e182b0c9._comment @@ -0,0 +1,13 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2024-11-11T18:22:11Z" + content=""" +I don't see how that could happen unless you still had an annex.largefiles +config somewhere. But --force-large would surely bypass +all configs and make it be added as a large file. + +The places you could have annex.largefiles set that you didn't show are in +git config, and in git-annex config. My guess is it was in git config, +since that place to set it does override any gitattributes settings. +"""]]
response
diff --git a/doc/git-annex-reregisterurl/comment_2_c3754e2f52aa3094c1d1345dfdb014dd._comment b/doc/git-annex-reregisterurl/comment_2_c3754e2f52aa3094c1d1345dfdb014dd._comment new file mode 100644 index 0000000000..988ed4bcee --- /dev/null +++ b/doc/git-annex-reregisterurl/comment_2_c3754e2f52aa3094c1d1345dfdb014dd._comment @@ -0,0 +1,15 @@ +[[!comment format=mdwn + username="joey" + subject="""Re: cannot "unregister" fully""" + date="2024-11-11T18:18:38Z" + content=""" +You can use `git-annex setpresentkey` to tell git-annex that content is not +present in your special remote. + +Usually whether or not an url is currently registered with a special remote +does not affect whether content is actually stored in it, which is why +these are 2 different things. Except for the web special remote, which is a +special case. I guess yours special remote also may be a special case, but +git-annex doesn't know about it, and so why not use setpresentkey to handle +the special case, rather than further complicating reregisterurl with it? +"""]]
vpop: Only update state after successful checkout
If checkout fails for some reason, they're still in a view, and should be
able to vpop again.
If checkout fails for some reason, they're still in a view, and should be
able to vpop again.
diff --git a/CHANGELOG b/CHANGELOG index b025184df3..1fda614aa9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ git-annex (10.20241032) UNRELEASED; urgency=medium * git-remote-annex: Fix a reversion introduced in version 10.20241031 that broke cloning from a special remote. + * vpop: Only update state after successful checkout. -- Joey Hess <id@joeyh.name> Mon, 11 Nov 2024 12:26:00 -0400 diff --git a/Command/VPop.hs b/Command/VPop.hs index a5fac3ac37..80912b4486 100644 --- a/Command/VPop.hs +++ b/Command/VPop.hs @@ -28,22 +28,29 @@ start ps = go =<< currentView where go Nothing = giveup "Not in a view." go (Just (v, madj)) = starting "vpop" ai si $ do - removeView v - (oldvs, vs) <- splitAt (num - 1) . filter (sameparentbranch v) + (oldvs, vs) <- splitAt (num - 1) + . filter (sameparentbranch v) + . filter (/= v) <$> recentViews - mapM_ removeView oldvs - case vs of - (oldv:_) -> next $ do + let removeview = mapM_ removeView (v : oldvs) + ok <- case vs of + (oldv:_) -> do showOutput checkoutViewBranch oldv madj (\v' madj' -> return (branchView v' madj')) - _ -> next $ do + _ -> do showOutput inRepo $ Git.Command.runBool [ Param "checkout" , Param $ Git.fromRef $ Git.Ref.base $ viewParentBranch v ] + if ok + then + next $ do + removeview + return True + else next $ return False sameparentbranch a b = viewParentBranch a == viewParentBranch b num = fromMaybe 1 $ readish =<< headMaybe ps diff --git a/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long/comment_2_7b29f41866000bbdcd1e90b3a3105716._comment b/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long/comment_2_7b29f41866000bbdcd1e90b3a3105716._comment new file mode 100644 index 0000000000..9fdaa9546b --- /dev/null +++ b/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long/comment_2_7b29f41866000bbdcd1e90b3a3105716._comment @@ -0,0 +1,28 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2024-11-11T17:56:35Z" + content=""" +This can certainly happen if files in 
the repository are in a directory +path that, when converted to a filename, is too long. + +I tried reproducing it, and was basically able to get into the same state +as you. The reason both "git-annex vpop" and "git checkout master" fail +is that since the long files were staged in git, but unable to be written, +git considers them to be deleted. And so it refuses to do a checkout, +with "Your local changes to the following files would +be overwritten by checkout" + +I was able to resolve it by deleting the directory that contained those +too long files. Which was empty anyway. That made git treat those files +as deleted, and allowed "git checkout master" to work, as well as +"git-annex vpop". + +The reason "git-annex vpop" failed for you with "Not in a view" is that was +actually the second time you ran it, and the first time, despite the git +checkout failing, it had proceeded to update git-annex's state to say it had +popped out of the view. I've fixed that bug. + +As to whether git-annex should try to detect this and avoid entering such a +view, I dunno.. +"""]]
close
diff --git a/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git.mdwn b/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git.mdwn index 2b84685e8a..71981ddfcc 100644 --- a/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git.mdwn +++ b/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git.mdwn @@ -8,3 +8,5 @@ Not limited to directory special remote, also happens with rclone. --[[Joey]] [[!tag projects/INM7]] + +> [[fixed|done]] and test suite updated --[[Joey]]
git-remote-annex: Fix a reversion
Introduced in version 10.20241031 that broke cloning from a special remote
retrieveKeyFile changed to use createAnnexDirectory, which means that the
path passed to it needs to be under .git
git-remote-annex is probably the only thing in git-annex where that was not
the case. And there's no real reason it cannot be the case with it either.
Just use withOtherTmp.
Introduced in version 10.20241031 that broke cloning from a special remote
retrieveKeyFile changed to use createAnnexDirectory, which means that the
path passed to it needs to be under .git
git-remote-annex is probably the only thing in git-annex where that was not
the case. And there's no real reason it cannot be the case with it either.
Just use withOtherTmp.
diff --git a/CHANGELOG b/CHANGELOG index 584ab7deda..b025184df3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,10 @@ +git-annex (10.20241032) UNRELEASED; urgency=medium + + * git-remote-annex: Fix a reversion introduced in version 10.20241031 + that broke cloning from a special remote. + + -- Joey Hess <id@joeyh.name> Mon, 11 Nov 2024 12:26:00 -0400 + git-annex (10.20241031) upstream; urgency=medium * Sped up proxied downloads from special remotes, by streaming. diff --git a/CmdLine/GitRemoteAnnex.hs b/CmdLine/GitRemoteAnnex.hs index 5d95c34bb1..36d2446e4e 100644 --- a/CmdLine/GitRemoteAnnex.hs +++ b/CmdLine/GitRemoteAnnex.hs @@ -48,6 +48,7 @@ import Annex.Init import Annex.UUID import Annex.Content import Annex.Perms +import Annex.Tmp import Annex.SpecialRemote.Config import Remote.List import Remote.List.Util @@ -719,13 +720,14 @@ downloadManifest rmt = get mkmain >>= maybe (get mkbak) (pure . Just) -- directory. The content of manifests is not stable, and so -- it needs to re-download it fresh every time, and the object -- file should not be stored locally. 
- gettotmp dl = withTmpFile "GITMANIFEST" $ \tmp tmph -> do - liftIO $ hClose tmph - _ <- dl tmp - b <- liftIO (B.readFile tmp) - case parseManifest b of - Right m -> Just <$> verifyManifest rmt m - Left err -> giveup err + gettotmp dl = withOtherTmp $ \othertmp -> + withTmpFileIn (fromRawFilePath othertmp) "GITMANIFEST" $ \tmp tmph -> do + liftIO $ hClose tmph + _ <- dl tmp + b <- liftIO (B.readFile tmp) + case parseManifest b of + Right m -> Just <$> verifyManifest rmt m + Left err -> giveup err getexport _ [] = return Nothing getexport mk (loc:locs) = diff --git a/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git/comment_2_6dda2cfd76732792a373ea2cc359ffb3._comment b/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git/comment_2_6dda2cfd76732792a373ea2cc359ffb3._comment new file mode 100644 index 0000000000..483abc4279 --- /dev/null +++ b/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git/comment_2_6dda2cfd76732792a373ea2cc359ffb3._comment @@ -0,0 +1,11 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2024-11-11T16:39:48Z" + content=""" +Fixed by changing it to download to a path under .git + +Also, this directory special remote case makes a good test case so I will +add it to the test suite, which will be the first git-remote-annex in the +test suite. +"""]]
analysis
diff --git a/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git/comment_1_d0d2c7bd1bb09b5efec590600b37e40e._comment b/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git/comment_1_d0d2c7bd1bb09b5efec590600b37e40e._comment new file mode 100644 index 0000000000..bab2ad8777 --- /dev/null +++ b/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git/comment_1_d0d2c7bd1bb09b5efec590600b37e40e._comment @@ -0,0 +1,14 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2024-11-11T16:20:37Z" + content=""" +Bisected the reversion to [[!commit 835283b86240ef4b68c44f4332ac1b644e08e49f]]. + +A createAnnexDirectory call gets added to fileRetriever in that commit, +and that uses createDirectoryUnder. In the case of git-remote-annex, +the destination file is not under the .git directory, and so it fails. +Before this commit, it was retrieved to under the .git directory +and then renamed into place, and that rename didn't care if the destination +was not under .git. +"""]]
break out new bug report
diff --git a/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git.mdwn b/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git.mdwn new file mode 100644 index 0000000000..2b84685e8a --- /dev/null +++ b/doc/bugs/git-remote-annex_fails_createDirectoryFrom_not_located_in_git.mdwn @@ -0,0 +1,10 @@ +Likely a reversion, but I have not checked. + + joey@darkstar:~/bench>git clone 'annex::4cdb3781-ee3c-4830-b937-a2ec2015aebc?encryption=none&type=directory&directory=/home/joey/bench/d' xx + Cloning into 'xx'... + git-annex: /home/joey/tmp: createDirectoryFrom: not located in xx/.git: user error + +Not limited to directory special remote, also happens with rclone. +--[[Joey]] + +[[!tag projects/INM7]] diff --git a/doc/bugs/git_remote_annex_-_rclone/comment_6_2d742ac66649225d9dfff9ce8414481d._comment b/doc/bugs/git_remote_annex_-_rclone/comment_6_2d742ac66649225d9dfff9ce8414481d._comment new file mode 100644 index 0000000000..f180fda1fd --- /dev/null +++ b/doc/bugs/git_remote_annex_-_rclone/comment_6_2d742ac66649225d9dfff9ce8414481d._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 6""" + date="2024-11-11T16:02:21Z" + content=""" +That problem is not limited to rclone. I opened a bug +[[git-remote-annex_fails_createDirectoryFrom_not_located_in_git]] +"""]]
diff --git a/doc/bugs/assistant_xdg-autostart_unit_should_not_fail.mdwn b/doc/bugs/assistant_xdg-autostart_unit_should_not_fail.mdwn new file mode 100644 index 0000000000..12dc1710ee --- /dev/null +++ b/doc/bugs/assistant_xdg-autostart_unit_should_not_fail.mdwn @@ -0,0 +1,23 @@ +### Please describe the problem. + +My distro was configured to install all of my environment's packages' autostart files into the global environment unbeknownst to me which became evident after I finally started getting graphical session startup right. Git annex ships one of these files for the assistant (which I don't use) that does `git-annex assistant --autostart`. + +When looking at `systemctl --user status`, I found my system to be in a "degraded" state because the `app-git\x2dannex@autostart.service` unit starts and fails every time I log into a `graphical-session.target`: + +``` +Nov 11 00:23:51 THESEUS systemd[1912]: Starting Git Annex Assistant... +Nov 11 00:23:51 THESEUS systemd[1912]: Started Git Annex Assistant. +Nov 11 00:23:51 THESEUS git-annex[373988]: git-annex: Nothing listed in /Users/atemu/.config/git-annex/autostart +Nov 11 00:23:51 THESEUS systemd[1912]: app-git\x2dannex@autostart.service: Main process exited, code=exited, status=1/FAILURE +Nov 11 00:23:51 THESEUS systemd[1912]: app-git\x2dannex@autostart.service: Failed with result 'exit-code'. +``` + +This is surprising given that I don't use this feature. It shouldn't be a failure because it's expected for the unit to not do anything when not explicitly configured to do so (as evidenced by the config file not being present). + +git-annex assistant's xdg-autostart file should be set up in a way such that not finding the autostart file is not considered a failure and does not make the resulting systemd unit fail. + +(I've since deactivated xdg-autostart because I don't intend to use it but this should be fixed regardless.) + +### Have you had any luck using git-annex before? 
(Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders) + +Hands down the best way to manage your files :)
Added a comment: Follow-up
diff --git a/doc/bugs/tls__58___peer_does_not_support_Extended_Main_Secret/comment_4_910360d29f22bdf0fdd51d8b7e4454f0._comment b/doc/bugs/tls__58___peer_does_not_support_Extended_Main_Secret/comment_4_910360d29f22bdf0fdd51d8b7e4454f0._comment new file mode 100644 index 0000000000..7ca8179a7d --- /dev/null +++ b/doc/bugs/tls__58___peer_does_not_support_Extended_Main_Secret/comment_4_910360d29f22bdf0fdd51d8b7e4454f0._comment @@ -0,0 +1,10 @@ +[[!comment format=mdwn + username="AlexPraga" + avatar="http://cdn.libravatar.org/avatar/7c4e10fd352b81279b405f9f5337cdb7" + subject="Follow-up" + date="2024-11-07T21:46:32Z" + content=""" +Hi, +I've been bit by this bug and cannot depend on the website to update its TLS. Is there a way for git-annex to \"ignore\" such a restriction (apart from using an older version) ? +Thanks, +"""]]
diff --git a/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn b/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn index baf88934f1..01c34bb718 100644 --- a/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn +++ b/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn @@ -1,6 +1,6 @@ ### Please describe the problem. -I wanted to use this remote to "crawl" S3 bucket in `importtree=yes` mode. Bucket (dandiarchive) supports versioning, so it would be great to enable versioning here as well so URLs would use versionId. But unfortunately adding `versioning=yes` makes `git-annex` to try to establish versioning on the bucket (even if it is already enabled). +I wanted to use S3 special remote to "crawl" S3 bucket in `importtree=yes` mode. Bucket (dandiarchive) supports versioning, so it would be great to enable versioning here as well so URLs would use versionId. But unfortunately adding `versioning=yes` makes `git-annex` to try to establish versioning on the bucket (even if it is already enabled). command to try with (should work for anyone since public bucket):
add empty line so ``` is formatted correctly
diff --git a/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn b/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn index ac5294e5b6..baf88934f1 100644 --- a/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn +++ b/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn @@ -9,6 +9,7 @@ git annex --debug initremote s3-dandiarchive bucket=dandiarchive type=S3 encrypt ``` to see that annex (I use 10.20240927) would try to enable versioning: + ``` (enabling bucket versioning...) [2024-11-07 16:30:37.830416324] (Remote.S3) String to sign: "PUT\n\n\nThu, 07 Nov 2024 21:30:37 GMT\n/dandiarchive/?versioning" [2024-11-07 16:30:37.830449238] (Remote.S3) Host: "dandiarchive.s3.amazonaws.com"
initial report on need to check if versioning is already enabled and thus avoid PUT
diff --git a/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn b/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn new file mode 100644 index 0000000000..ac5294e5b6 --- /dev/null +++ b/doc/bugs/importtree_with_versioning__61__yes__58___check_first.mdwn @@ -0,0 +1,31 @@ +### Please describe the problem. + +I wanted to use this remote to "crawl" S3 bucket in `importtree=yes` mode. Bucket (dandiarchive) supports versioning, so it would be great to enable versioning here as well so URLs would use versionId. But unfortunately adding `versioning=yes` makes `git-annex` to try to establish versioning on the bucket (even if it is already enabled). + +command to try with (should work for anyone since public bucket): + +``` +git annex --debug initremote s3-dandiarchive bucket=dandiarchive type=S3 encryption=none importtree=yes publicurl=https://dandiarchive.s3.amazonaws.com/ fileprefix=dandisets/000027/ signature=anonymous versioning=yes +``` + +to see that annex (I use 10.20240927) would try to enable versioning: +``` +(enabling bucket versioning...) 
[2024-11-07 16:30:37.830416324] (Remote.S3) String to sign: "PUT\n\n\nThu, 07 Nov 2024 21:30:37 GMT\n/dandiarchive/?versioning" +[2024-11-07 16:30:37.830449238] (Remote.S3) Host: "dandiarchive.s3.amazonaws.com" +[2024-11-07 16:30:37.830459034] (Remote.S3) Path: "/" +[2024-11-07 16:30:37.830470676] (Remote.S3) Query string: "versioning" +[2024-11-07 16:30:37.830480666] (Remote.S3) Header: [("Date","Thu, 07 Nov 2024 21:30:37 GMT")] +[2024-11-07 16:30:37.830498329] (Remote.S3) Body: "<?xml version=\"1.0\" encoding=\"UTF-8\"?><VersioningConfiguration xmlns=\"http://s3.amazonaws.com/doc/2006-03-01/\"><Status>Enabled</Status></VersioningConfiguration>" +[2024-11-07 16:30:37.879924822] (Remote.S3) Response status: Status {statusCode = 403, statusMessage = "Forbidden"} +``` + +It seems to be easy to check if versioning enabled: + +``` +❯ curl -s "https://dandiarchive.s3.amazonaws.com/?versioning" +<?xml version="1.0" encoding="UTF-8"?> +<VersioningConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Status>Enabled</Status></VersioningConfiguration> +``` + +[[!meta author=yoh]] +[[!tag projects/dandi]]
removed
diff --git a/doc/tips/storing_data_in_git-lfs/comment_2_508186714106491bb1a06e66fe86a5f8._comment b/doc/tips/storing_data_in_git-lfs/comment_2_508186714106491bb1a06e66fe86a5f8._comment deleted file mode 100644 index c44c58440c..0000000000 --- a/doc/tips/storing_data_in_git-lfs/comment_2_508186714106491bb1a06e66fe86a5f8._comment +++ /dev/null @@ -1,9 +0,0 @@ -[[!comment format=mdwn - username="marc+bugs@bd1c733f9e5bee7c2279a9c649e80fac0e540c14" - nickname="marc+bugs" - avatar="http://cdn.libravatar.org/avatar/ed6686640356357f32b5cc7f4ace9471" - subject="Examples" - date="2024-11-07T14:13:27Z" - content=""" -Shouldn't the urls of the remotes `lfs` and `lfs-http` in the examples on this page be reversed? -"""]]
Added a comment: Examples
diff --git a/doc/tips/storing_data_in_git-lfs/comment_2_508186714106491bb1a06e66fe86a5f8._comment b/doc/tips/storing_data_in_git-lfs/comment_2_508186714106491bb1a06e66fe86a5f8._comment new file mode 100644 index 0000000000..c44c58440c --- /dev/null +++ b/doc/tips/storing_data_in_git-lfs/comment_2_508186714106491bb1a06e66fe86a5f8._comment @@ -0,0 +1,9 @@ +[[!comment format=mdwn + username="marc+bugs@bd1c733f9e5bee7c2279a9c649e80fac0e540c14" + nickname="marc+bugs" + avatar="http://cdn.libravatar.org/avatar/ed6686640356357f32b5cc7f4ace9471" + subject="Examples" + date="2024-11-07T14:13:27Z" + content=""" +Shouldn't the urls of the remotes `lfs` and `lfs-http` in the examples on this page be reversed? +"""]]
Added a comment: cannot "unregister" fully
diff --git a/doc/git-annex-reregisterurl/comment_1_2b77b76dd091ff7d3df776c700b29fe6._comment b/doc/git-annex-reregisterurl/comment_1_2b77b76dd091ff7d3df776c700b29fe6._comment new file mode 100644 index 0000000000..5d62a6fe8e --- /dev/null +++ b/doc/git-annex-reregisterurl/comment_1_2b77b76dd091ff7d3df776c700b29fe6._comment @@ -0,0 +1,20 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="cannot "unregister" fully" + date="2024-11-05T20:59:15Z" + content=""" +> Note that, like git-annex unregisterurl, using this option unregisters an url from a special remote, but it does not mark the content as not present in that special remote. + +But then how to mark \"not present\" in that remote? After `git-annex-reregisterurl` I still have `[datalad]` remote listed although url now, correctly, associated with `web`. I have tried `drop` but that didn't work + +```shell +❯ git annex drop --from datalad images/neurodesk/neurodesk-afni--21.2.00.simg +drop images/neurodesk/neurodesk-afni--21.2.00.simg (from datalad...) (locking smaug...) + Removal of content from urls is not possible +failed +drop: 1 failed +``` + +Could there may be option `--fully` or `--mark-not-present` option to make it 1 invocation operation? +"""]]
Added a comment: Re: corruption using git-annex-remote-rclone
diff --git a/doc/special_remotes/rclone/comment_7_6360df4c153cc904ff98ba4d98987891._comment b/doc/special_remotes/rclone/comment_7_6360df4c153cc904ff98ba4d98987891._comment new file mode 100644 index 0000000000..baf218374e --- /dev/null +++ b/doc/special_remotes/rclone/comment_7_6360df4c153cc904ff98ba4d98987891._comment @@ -0,0 +1,16 @@ +[[!comment format=mdwn + username="mike@2d6d71f56ce2a992244350475251df87c26fe351" + nickname="mike" + avatar="http://cdn.libravatar.org/avatar/183fa439752e2f0c6f39ede658d81050" + subject="Re: corruption using git-annex-remote-rclone" + date="2024-11-05T15:43:09Z" + content=""" +I think this only happens with some rclone remote backends (like pcloud). The pcloud backend definitely keeps partially uploaded files, under the name of the full file. The backend attempts to do the right thing and uses the `nopartial` option of the pcloud API, but this does not work as it should [1]. + +I believe the latest rclone updates in 1.68.x should fix this issue, because they handle partial uploads in rclone itself [2]. + +Re: `rclone gitannex`: I only updated one client to use this, but I've also been careful to never interrupt uploads, so I can't tell. But I don't see how it behaves differently in this regard. + +[1] https://forum.rclone.org/t/pcloud-keeps-partial-uploads/46026 +[2] See changelog, the OpenWriterAt feature implies PartialUploads: https://rclone.org/changelog/#v1-68-0-2024-09-08 +"""]]
initial report on keeping adding to git
diff --git a/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn new file mode 100644 index 0000000000..4b4a092ffe --- /dev/null +++ b/doc/bugs/keeps_adding_to_git_even_though_largerthan__61__.mdwn @@ -0,0 +1,63 @@ +### Please describe the problem. + +I had a pure git repo, and then too large file was committed, I `git reset HEAD^` went through `datalad create -c text2git -f .` to give default configuration with `* annex.largefiles=((mimeencoding=binary)and(largerthan=0))` and tried to `git annex add` that large text file -- but annex kept adding to git. I simplified `largefiles` further -- keeps adding to git: + + +``` +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git check-attr annex.largefiles .duct/logs/2024.10.30T14.59.27-418623_stdout +.duct/logs/2024.10.30T14.59.27-418623_stdout: annex.largefiles: largerthan=100kb + +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git annex add .duct/logs/2024.10.30T14.59.27-418623_stdout +add .duct/logs/2024.10.30T14.59.27-418623_stdout (non-large file; adding content to git repository) ^C + +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git status +On branch master +Your branch is ahead of 'origin/master' by 9 commits. + (use "git push" to publish your local commits) + +Untracked files: + (use "git add <file>..." 
to include in what will be committed) + .duct/logs/2024.10.30T14.59.27-418623_info.json + .duct/logs/2024.10.30T14.59.27-418623_stderr + .duct/logs/2024.10.30T14.59.27-418623_stdout + .duct/logs/2024.10.30T14.59.27-418623_usage.json + .duct/logs/2024.11.04T12.31.25-2144989_info.json + .duct/logs/2024.11.04T12.31.25-2144989_stderr + .duct/logs/2024.11.04T12.31.25-2144989_stdout + .duct/logs/2024.11.04T12.31.25-2144989_usage.json + +nothing added to commit but untracked files present (use "git add" to track) +yoh@typhon:~/proj/dandi/s5cmd-dandi$ du -k .duct/logs/2024.10.30T14.59.27-418623_stdout +53615856 .duct/logs/2024.10.30T14.59.27-418623_stdout + +``` + +even if I remove any explicit rule, and try to annex -- goes to git ... + +``` +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git annex add .duct/logs/2024.10.30T14.59.27-418623_stdout +add .duct/logs/2024.10.30T14.59.27-418623_stdout (non-large file; adding content to git repository) ^C + +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git check-attr annex.largefiles .duct/logs/2024.10.30T14.59.27-418623_stdout +.duct/logs/2024.10.30T14.59.27-418623_stdout: annex.largefiles: unspecified +``` + +here is debug output + +``` +yoh@typhon:~/proj/dandi/s5cmd-dandi$ git annex add --debug .duct/logs/2024.10.30T14.59.27-418623_stdout +[2024-11-04 12:51:00.940826688] (Utility.Process) process [2203424] read: git ["--git-dir=.git","--work-tree=.","--literal-pathspecs","-c","annex.debug=true","show-ref","git-annex"] +[2024-11-04 12:51:00.945662132] (Utility.Process) process [2203424] done ExitSuccess +[2024-11-04 12:51:00.946114536] (Utility.Process) process [2203425] read: git ["--git-dir=.git","--work-tree=.","--literal-pathspecs","-c","annex.debug=true","show-ref","--hash","refs/heads/git-annex"] +[2024-11-04 12:51:00.950113577] (Utility.Process) process [2203425] done ExitSuccess +[2024-11-04 12:51:00.951187839] (Utility.Process) process [2203426] chat: git 
["--git-dir=.git","--work-tree=.","--literal-pathspecs","-c","annex.debug=true","cat-file","--batch"] +[2024-11-04 12:51:00.95355332] (Utility.Process) process [2203427] read: git ["--git-dir=.git","--work-tree=.","--literal-pathspecs","-c","annex.debug=true","symbolic-ref","-q","HEAD"] +[2024-11-04 12:51:00.957754692] (Utility.Process) process [2203427] done ExitFailure 1 +[2024-11-04 12:51:00.958076235] (Utility.Process) process [2203428] read: git ["--git-dir=.git","--work-tree=.","--literal-pathspecs","-c","annex.debug=true","ls-files","-z","--others","--exclude-standard","--",".duct/logs/2024.10.30T14.59.27-418623_stdout"] +add .duct/logs/2024.10.30T14.59.27-418623_stdout (non-large file; adding content to git repository) [2024-11-04 12:51:00.960616592] (Utility.Process) process [2203429] chat: git ["--git-dir=.git","--work-tree=.","--literal-pathspecs","-c","annex.debug=true","hash-object","-w","--no-filters","--stdin-paths"] +^C +``` + +How to remedy? + +dandi project
Added a comment
diff --git a/doc/bugs/git_remote_annex_-_rclone/comment_5_d0f2588ed0da0edac4932ac3f1e2d8d8._comment b/doc/bugs/git_remote_annex_-_rclone/comment_5_d0f2588ed0da0edac4932ac3f1e2d8d8._comment new file mode 100644 index 0000000000..e5774cf382 --- /dev/null +++ b/doc/bugs/git_remote_annex_-_rclone/comment_5_d0f2588ed0da0edac4932ac3f1e2d8d8._comment @@ -0,0 +1,56 @@ +[[!comment format=mdwn + username="matrss" + avatar="http://cdn.libravatar.org/avatar/59541f50d845e5f81aff06e88a38b9de" + subject="comment 5" + date="2024-11-04T14:52:55Z" + content=""" +With the latest git-annex-standalone I am now getting a different error: initially pushing to an rclone special remote seems to work, but cloning and other operations like `git remote show {remote}` fail: + +``` +$ git annex initremote icg1220-remote-dir type=rclone encryption=none rcloneremotename=':sftp,host=icg1220' +initremote icg1220-remote-dir 2024/11/04 15:48:19 NOTICE: Can't save config \"shell_type\" for on the fly backend \":sftp\" +ok +(recording state in git...) +$ git annex enableremote icg1220-remote-dir --with-url +enableremote icg1220-remote-dir 2024/11/04 15:48:27 NOTICE: Can't save config \"shell_type\" for on the fly backend \":sftp\" +ok +(recording state in git...) +$ git annex push +copy qwer.txt 2024/11/04 15:48:32 NOTICE: Can't save config \"shell_type\" for on the fly backend \":sftp\" +(to icg1220-remote-dir...) +2024/11/04 15:48:33 NOTICE: Can't save config \"md5sum_command\" for on the fly backend \":sftp\" +2024/11/04 15:48:33 NOTICE: Can't save config \"sha1sum_command\" for on the fly backend \":sftp\" +ok +(recording state in git...) 
+push icg1220-remote-dir +Full remote url: annex::85e65a54-6bff-49a7-9e62-db59d8912ceb?encryption=none&type=rclone +2024/11/04 15:48:35 NOTICE: Can't save config \"shell_type\" for on the fly backend \":sftp\" +2024/11/04 15:48:35 NOTICE: Can't save config \"md5sum_command\" for on the fly backend \":sftp\" +2024/11/04 15:48:35 NOTICE: Can't save config \"sha1sum_command\" for on the fly backend \":sftp\" +To annex:: + * [new branch] main -> synced/main + * [new branch] git-annex -> synced/git-annex +Full remote url: annex::85e65a54-6bff-49a7-9e62-db59d8912ceb?encryption=none&type=rclone +2024/11/04 15:48:39 NOTICE: Can't save config \"shell_type\" for on the fly backend \":sftp\" +git-annex: /tmp: createDirectoryFrom: not located in .git: user error +ok +$ git remote show icg1220-remote-dir +Full remote url: annex::85e65a54-6bff-49a7-9e62-db59d8912ceb?encryption=none&type=rclone +2024/11/04 15:48:54 NOTICE: Can't save config \"shell_type\" for on the fly backend \":sftp\" +git-annex: /tmp: createDirectoryFrom: not located in .git: user error +[ble: exit 128] +``` + +This is what's created on the remote: + +``` +$ ls -la git-annex-rclone/ +insgesamt 24 +drwxr-xr-x 2 icg149 icg1 4096 Nov 4 15:48 . +drwx------ 81 icg149 icg1 4096 Nov 4 15:48 .. +-rw-r--r-- 1 icg149 icg1 3079 Nov 4 15:48 GITBUNDLE-s3079--85e65a54-6bff-49a7-9e62-db59d8912ceb-d34f9bc87f73892ce9cb135ebe0d0c9c98e01b8b7845f6ef15a6dae4bf45a8dc +-rw-r--r-- 1 icg149 icg1 119 Nov 4 15:48 GITMANIFEST--85e65a54-6bff-49a7-9e62-db59d8912ceb +-rw-r--r-- 1 icg149 icg1 119 Nov 4 15:48 GITMANIFEST--85e65a54-6bff-49a7-9e62-db59d8912ceb.bak +-rw-r--r-- 1 icg149 icg1 5 Nov 4 15:47 MD5E-s5--2b00042f7481c7b056c4b410d28f33cf.txt +``` +"""]]
Added a comment: possible fix
diff --git a/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long/comment_1_902f5c6ccc6b65ecdbc89b3bf3a2468a._comment b/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long/comment_1_902f5c6ccc6b65ecdbc89b3bf3a2468a._comment new file mode 100644 index 0000000000..db46974388 --- /dev/null +++ b/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long/comment_1_902f5c6ccc6b65ecdbc89b3bf3a2468a._comment @@ -0,0 +1,25 @@ +[[!comment format=mdwn + username="psxvoid" + avatar="http://cdn.libravatar.org/avatar/fde068fbdeabeea31e3be7aa9c55d84b" + subject="possible fix" + date="2024-11-04T13:54:23Z" + content=""" +Hi, + +Seem like I've managed to fix it by openning `.git/HEAD` file with VIM, and then changing: + +``` +ref: refs/heads/views/main(tag=_) +``` + +to this: + +``` +ref: refs/heads/main +``` + +Saved the file, then ran `git stash --all`. Which returned the repo to the original state. + +Also before `git stash --all`, I ran `git restore .` `git annex fsck`, and `git add -A` but those didn't change anything. + +"""]]
Added a comment
diff --git a/doc/todo/copy__47__move_support_for_pushinsteadOf_/comment_5_a44a1cf7b7a96854ac5b40507865c355._comment b/doc/todo/copy__47__move_support_for_pushinsteadOf_/comment_5_a44a1cf7b7a96854ac5b40507865c355._comment new file mode 100644 index 0000000000..a870583f75 --- /dev/null +++ b/doc/todo/copy__47__move_support_for_pushinsteadOf_/comment_5_a44a1cf7b7a96854ac5b40507865c355._comment @@ -0,0 +1,12 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="comment 5" + date="2024-11-03T14:48:03Z" + content=""" +FWIW, I keep running into this. Re + +> But: If this change were made, it would risk breaking existing working setups, that happen to have a push url that points to a different repository. + +`pushurl` could take precedence, as overwrite the `pushInsteadOf` mapped value (did not check what git's behavior in presence of both pushurl and pushInsteadOf). +"""]]
diff --git a/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long.mdwn b/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long.mdwn index e274f8ff0e..28e64dccb7 100644 --- a/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long.mdwn +++ b/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long.mdwn @@ -21,7 +21,8 @@ getconf PATH_MAX /pool-name 4096 ``` -Steps to repro: +Steps to reproduce: + 1. create a file in a directory with a long path 1. create a tag 2. switch to a tag
diff --git a/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long.mdwn b/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long.mdwn new file mode 100644 index 0000000000..e274f8ff0e --- /dev/null +++ b/doc/forum/Stuck_in_git_annex_view__58___file_name_too_long.mdwn @@ -0,0 +1,42 @@ +Hi, + +I was experimenting with tags today, and seems like switching to a tag view broke my repository. Now, even `git status` gives me an error, output: + +``` +...vi.srt: File name too long +nothing to commit, working tree clean +On branch views/main(tag=_) +``` + +Before that I ran `git annex view tag=*`. + +What I'm thinking is that before switching to a tag view file name/path was fine, but during switching seems like git annex transformed the file name which exceeds the max file path. + +For ZFS those limits are: + +``` +getconf NAME_MAX /pool-name +255 +getconf PATH_MAX /pool-name +4096 +``` + +Steps to repro: +1. create a file in a directory with a long path +1. create a tag +2. switch to a tag + +for example, here is one of the errors: + +``` +size﹕huge/02_getting-a-handle-on-vectors_%knowledge-storages%courses%authorities%coursera%linear-algebra-machine-learning%01_introduction-to-linear-algebra-and-to-mathematics-for-machine-learning%02_the-relationship-between-machine-learning-linear-algebra-and-vectors-and%.pl.srt: File name too long +``` + +I've tried the following commands, but due to this error nothing changes: + +1. `git annex vpop` < says "git-annex: Not in a view." +2. `git checkout main` < does nothing, stays "On branch views/main(tag=_)" +3. `git restore .` < does nothing, stays "On branch views/main(tag=_)" + + +Is there anything can be done to at least go back to the main branch and clear the error?
removed
diff --git a/doc/forum/Repository_on_large_disk_server__44___browse_on_client/comment_2_8baf3d5ece8839ad61c24b7daa1ee83e._comment b/doc/forum/Repository_on_large_disk_server__44___browse_on_client/comment_2_8baf3d5ece8839ad61c24b7daa1ee83e._comment deleted file mode 100644 index 21a789d491..0000000000 --- a/doc/forum/Repository_on_large_disk_server__44___browse_on_client/comment_2_8baf3d5ece8839ad61c24b7daa1ee83e._comment +++ /dev/null @@ -1,12 +0,0 @@ -[[!comment format=mdwn - username="psxvoid" - avatar="http://cdn.libravatar.org/avatar/fde068fbdeabeea31e3be7aa9c55d84b" - subject="using fuse and nextcloud" - date="2024-11-03T13:44:57Z" - content=""" -Hi, - -One option is to use `bindfs --resolve-symlinks` FUSE to resolve simlinks, and then share the \"un-symlinked\" directory to whatever service you want, including NFS. - -Personally, I'm using `bindfs --resolve-symlinks` with `nextcloud` instance - works great. -"""]]
Added a comment: using fuse and nextcloud
diff --git a/doc/forum/Repository_on_large_disk_server__44___browse_on_client/comment_2_8baf3d5ece8839ad61c24b7daa1ee83e._comment b/doc/forum/Repository_on_large_disk_server__44___browse_on_client/comment_2_8baf3d5ece8839ad61c24b7daa1ee83e._comment new file mode 100644 index 0000000000..21a789d491 --- /dev/null +++ b/doc/forum/Repository_on_large_disk_server__44___browse_on_client/comment_2_8baf3d5ece8839ad61c24b7daa1ee83e._comment @@ -0,0 +1,12 @@ +[[!comment format=mdwn + username="psxvoid" + avatar="http://cdn.libravatar.org/avatar/fde068fbdeabeea31e3be7aa9c55d84b" + subject="using fuse and nextcloud" + date="2024-11-03T13:44:57Z" + content=""" +Hi, + +One option is to use `bindfs --resolve-symlinks` FUSE to resolve simlinks, and then share the \"un-symlinked\" directory to whatever service you want, including NFS. + +Personally, I'm using `bindfs --resolve-symlinks` with `nextcloud` instance - works great. +"""]]
Added a comment: using fuse and nextcloud
diff --git a/doc/forum/Repository_on_large_disk_server__44___browse_on_client/comment_1_6563a45d5303d5e65e88aa6df162f9fc._comment b/doc/forum/Repository_on_large_disk_server__44___browse_on_client/comment_1_6563a45d5303d5e65e88aa6df162f9fc._comment new file mode 100644 index 0000000000..624764e22c --- /dev/null +++ b/doc/forum/Repository_on_large_disk_server__44___browse_on_client/comment_1_6563a45d5303d5e65e88aa6df162f9fc._comment @@ -0,0 +1,12 @@ +[[!comment format=mdwn + username="psxvoid" + avatar="http://cdn.libravatar.org/avatar/fde068fbdeabeea31e3be7aa9c55d84b" + subject="using fuse and nextcloud" + date="2024-11-03T13:44:44Z" + content=""" +Hi, + +One option is to use `bindfs --resolve-symlinks` FUSE to resolve simlinks, and then share the \"un-symlinked\" directory to whatever service you want, including NFS. + +Personally, I'm using `bindfs --resolve-symlinks` with `nextcloud` instance - works great. +"""]]
Added a comment
diff --git a/doc/special_remotes/webdav/comment_26_15dec3ce98a03a39caac21f34bccbc0e._comment b/doc/special_remotes/webdav/comment_26_15dec3ce98a03a39caac21f34bccbc0e._comment new file mode 100644 index 0000000000..be4007889d --- /dev/null +++ b/doc/special_remotes/webdav/comment_26_15dec3ce98a03a39caac21f34bccbc0e._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="comment 26" + date="2024-11-01T02:29:23Z" + content=""" +FWIW, since I [failed to install git-annex on boox](https://git-annex.branchable.com/bugs/install_on_android_boox__58___xargs_Permission_denied/), thought to attempt importtree/exporttree webdav special remote so was also displeased to see that importtree is not supported :-/ +"""]]
scaling
diff --git a/doc/thanks.mdwn b/doc/thanks.mdwn index 7707661bc7..9b06435e90 100644 --- a/doc/thanks.mdwn +++ b/doc/thanks.mdwn @@ -34,7 +34,7 @@ contributed good bug reports and great ideas. ## financial support, 2024 <img alt="OpenNeuro logo" src="https://raw.githubusercontent.com/OpenNeuroOrg/openneuro/1c1e0d3b2a2032729727702eb65b1b563eadce1d/packages/openneuro-components/src/assets/on-dark.svg" width=100> -<img alt="Standford wordmark" src="https://poldracklab.org/images/stanford-line2-8.png" height=100> +<img alt="Standford wordmark" src="https://poldracklab.org/images/stanford-line2-8.png" width=200> git-annex development is supported in large part by:
update
diff --git a/doc/thanks.mdwn b/doc/thanks.mdwn index aa0d090b4c..7707661bc7 100644 --- a/doc/thanks.mdwn +++ b/doc/thanks.mdwn @@ -33,8 +33,15 @@ contributed good bug reports and great ideas. ## financial support, 2024 +<img alt="OpenNeuro logo" src="https://raw.githubusercontent.com/OpenNeuroOrg/openneuro/1c1e0d3b2a2032729727702eb65b1b563eadce1d/packages/openneuro-components/src/assets/on-dark.svg" width=100> +<img alt="Standford wordmark" src="https://poldracklab.org/images/stanford-line2-8.png" height=100> + + git-annex development is supported in large part by: +* [OpenNeuro](https://openneuro.org/), funded by a + [BRAIN Initiative NIH grant](//reporter.nih.gov/project-details/10881961) + awarded to The Poldrack Lab at Stanford University. * [DANDI](https://www.dandiarchive.org/), funded by [a NIH grant](https://projectreporter.nih.gov/project_info_description.cfm?aid=9981835) awarded to MIT, Dartmouth College, and Kitware.
add news item for git-annex 10.20241031
diff --git a/doc/news/version_10.20240701.mdwn b/doc/news/version_10.20240701.mdwn deleted file mode 100644 index 83964ff8da..0000000000 --- a/doc/news/version_10.20240701.mdwn +++ /dev/null @@ -1,20 +0,0 @@ -git-annex 10.20240701 released with [[!toggle text="these changes"]] -[[!toggleable text=""" * git-annex remotes can now act as proxies that provide access to - their remotes. Configure this with remote.name.annex-proxy - and the git-annex update proxy command. - * Clusters are now supported. These are collections of nodes that can - be accessed as a single entity, accessed by one or more gateway - repositories. - * Added git-annex initcluster, updatecluster, and extendcluster commands. - * Fix a bug where interrupting git-annex while it is updating the - git-annex branch for an export could later lead to git fsck - complaining about missing tree objects. - * Tab completion of options like --from now includes special remotes, - as well as proxied remotes and clusters. - * Tab completion of many commands like info and trust now includes - remotes. - * P2P protocol version 2. - * Fix Windows build with Win32 2.13.4+ - Thanks, Oleg Tolmatcev - * When --debugfilter or annex.debugfilter is set, avoid propigating - debug output from git-annex-shell, since it cannot be filtered."""]] \ No newline at end of file diff --git a/doc/news/version_10.20241031.mdwn b/doc/news/version_10.20241031.mdwn new file mode 100644 index 0000000000..6e7fde8e94 --- /dev/null +++ b/doc/news/version_10.20241031.mdwn @@ -0,0 +1,25 @@ +git-annex 10.20241031 released with [[!toggle text="these changes"]] +[[!toggleable text=""" * Sped up proxied downloads from special remotes, by streaming. + * Added GETORDERED request to external special remote protocol. + When the external special remote responds with ORDERED, it can stream + through a proxy. + * p2phttp: Support serving unauthenticated users while requesting + authentication for operations that need it. 
Eg, --unauth-readonly + can be combined with --authenv. + * p2phttp: Allow unauthenticated users to lock content by default. + * p2phttp: Added --unauth-nolocking option to prevent unauthenticated + users from locking content. + * Allow enabling the servant build flag with older versions of stm, + allowing building with ghc 9.0.2. + * git-remote-annex: Fix bug that prevented using it with external special + remotes, leading to protocol error messages involving "GITMANIFEST". + * adjust: Allow any order of options when combining --hide-missing with + options like --unlock. + * Support P2P protocol version 4. This allows DATA-PRESENT to be sent + after PUT (and in the HTTP P2P protocol, v4/put has a data-present + parameter). When used with a proxy to a special remote like a S3 + bucket, this allows a custom client to upload content to S3 itself, + and then use the P2P protocol to inform the proxy that the content has + been stored there, which will result in the same git-annex branch state + updates as sending DATA via the proxy. + * Fix hang when receiving a large file into a proxied special remote."""]] \ No newline at end of file
oops, add the new todos meant to be in prev commit
diff --git a/doc/todo/assistant_does_not_use_LiveUpdate.mdwn b/doc/todo/assistant_does_not_use_LiveUpdate.mdwn new file mode 100644 index 0000000000..e1b8dcb3a4 --- /dev/null +++ b/doc/todo/assistant_does_not_use_LiveUpdate.mdwn @@ -0,0 +1,8 @@ +The assistant is using NoLiveUpdate, but it should be posssible to plumb +a LiveUpdate through it from preferred content checking to location log +updating. + +The benefit would be when using balanced preferred content expressions, +the assistant would get live updates about repo sizes. + +(This is a deferred item from the [[todo/git-annex_proxies]] megatodo.) --[[Joey]] diff --git a/doc/todo/faster_proxying.mdwn b/doc/todo/faster_proxying.mdwn new file mode 100644 index 0000000000..83f8b40d76 --- /dev/null +++ b/doc/todo/faster_proxying.mdwn @@ -0,0 +1,35 @@ +Not that proxying is super slow, but it does involve bouncing content +through the proxy, and could be made faster. Some ideas: + +* A proxy to a local git repository spawns git-annex-shell + to communicate with it. It would be more efficient to operate + directly on the Remote. Especially when transferring content to/from it. + But: When a cluster has several nodes that are local git repositories, + and is sending data to all of them, this would need an alternate + interface than `storeKey`, which supports streaming, of chunks + of a ByteString. + +* Use `sendfile()` to avoid data copying overhead when + `receiveBytes` is being fed right into `sendBytes`. + Library to use: + <https://hackage.haskell.org/package/hsyscall-0.4/docs/System-Syscall.html> + +* Getting a key from a cluster currently picks from amoung + the lowest cost nodes at random. This could be smarter, + eg prefer to avoid using nodes that are doing other transfers at the + same time. + +* The cost of a proxied node that is accessed via an intermediate gateway + is currently the same as a node accessed via the cluster gateway. So in + such a situation, git-annex may make a suboptimal choice of path. 
+ To fix this, there needs to be some way to tell how many hops through + gateways it takes to get to a node. Currently the only way is to + guess based on number of dashes in the node name, which is not satisfying. + + Even counting hops is not very satisfying, one cluster gateway could + be much more expensive to traverse than another one. + + If seriously tackling this, it might be worth making enough information + available to use spanning tree protocol for routing inside clusters. + +(This is a deferred item from the [[todo/git-annex_proxies]] megatodo.) --[[Joey]] diff --git a/doc/todo/git-remote-annex_support_for_p2phttp.mdwn b/doc/todo/git-remote-annex_support_for_p2phttp.mdwn new file mode 100644 index 0000000000..86703190ac --- /dev/null +++ b/doc/todo/git-remote-annex_support_for_p2phttp.mdwn @@ -0,0 +1,12 @@ +Should be possible to use a git-remote-annex annex::$uuid url as +remote.foo.url with remote.foo.annexUrl using annex+http, and so +not need a separate web server to serve the git repository when using +`git-annex p2phttp`. + +Doesn't work currently because git-remote-annex urls only support +special remotes. + +It would need a new form of git-remote-annex url, eg: +annex::$uuid?annex+http://example.com/git-annex/ + +(This is a deferred item from the [[todo/git-annex_proxies]] megatodo.) --[[Joey]] diff --git a/doc/todo/proxying_for_p2phttp_and_tor-annex_remotes.mdwn b/doc/todo/proxying_for_p2phttp_and_tor-annex_remotes.mdwn new file mode 100644 index 0000000000..6cf21033b5 --- /dev/null +++ b/doc/todo/proxying_for_p2phttp_and_tor-annex_remotes.mdwn @@ -0,0 +1,13 @@ +git-annex can proxy for remotes that are accessed locally or over +ssh, as well as special remotes. But, it cannot proxy for remotes that +themselves have a annex+http annexUrl. + +This would need a translation from P2P protocol to servant client. +Should not be very hard to implement if someone needs it for some reason. 
+ +Also, git-annex could support proxying to remotes whose url is a P2P +address. Eg, tor-annex remotes. This only needs a way to +generate a RemoteSide for them. + +(This is a deferred item from the [[todo/git-annex_proxies]] megatodo.) --[[Joey]] + diff --git a/doc/todo/smarter_use_of_disk_when_proxying.mdwn b/doc/todo/smarter_use_of_disk_when_proxying.mdwn new file mode 100644 index 0000000000..e2d68c8fd1 --- /dev/null +++ b/doc/todo/smarter_use_of_disk_when_proxying.mdwn @@ -0,0 +1,26 @@ +When proxying for a special remote, downloads can stream in from it and out +the proxy, but that does happen via a temporary file, which grows to the +full size of the file being downloaded. And uploads to a special get buffered to a +temporary file. + +It would be nice to do full streaming without temp files, but also it's a +hard change to make. + +Some improvements that could be made without making such a big change: + +* When an upload to a cluster is distributed to multiple special remotes, + a temporary file is written for each one, which may even happen in + parallel. This is a lot of extra work and may use excess disk space. + It should be possible to only write a single temp file. + +* Check annex.diskreserve when proxying for special remotes + to avoid the proxy's disk filling up with the temporary object file + cached there. + +* Resuming an interrupted download from proxied special remote makes the proxy + re-download the whole content. It could instead keep some of the + object files around when the client does not send SUCCESS. This would + use more disk, but could minimize to eg, the last 2 or so. + The [[design/passthrough_proxy]] design doc has some more thoughts about this. + +(This is a deferred item from the [[todo/git-annex_proxies]] megatodo.) --[[Joey]]
split up remaining items from todo/git-annex_proxies and close it!
diff --git a/Database/RepoSize.hs b/Database/RepoSize.hs index 25a2aca44a..13c7d7ebba 100644 --- a/Database/RepoSize.hs +++ b/Database/RepoSize.hs @@ -400,6 +400,11 @@ liveRepoOffsets (RepoSizeHandle (Just h) _) wantedsizechange = H.queryDb h $ do map (\(k, v) -> (k, [v])) $ fromMaybe [] $ M.lookup u livechanges + -- This could be optimised to a single SQL join, rather + -- than querying once for each live change. That would make + -- it less expensive when there are a lot happening at the + -- same time. Persistent is not capable of that join, + -- it would need a dependency on esquelito. livechanges' <- combinelikelivechanges <$> filterM (nonredundantlivechange livechangesbykey u) (fromMaybe [] $ M.lookup u livechanges) diff --git a/doc/todo/git-annex_info_with_limit_overcounts.mdwn b/doc/todo/git-annex_info_with_limit_overcounts.mdwn new file mode 100644 index 0000000000..13066ed0b7 --- /dev/null +++ b/doc/todo/git-annex_info_with_limit_overcounts.mdwn @@ -0,0 +1,7 @@ +`git-annex info` in the limitedcalc path in cachedAllRepoData +double-counts redundant information from the journal due to using +overLocationLogs. In the other path it does not (any more; it used to but +live repo sizes fixed that), and this should be fixed for consistency +and correctness. + +(This is a deferred item from the [[todo/git-annex_proxies]] megatodo.) --[[Joey]] diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index bb38166c19..fc6b180aa0 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -1,4 +1,4 @@ -This is a summary todo covering several subprojects, which would extend +This is a summary todo covering several subprojects, which extend git-annex to be able to use proxies which sit in front of a cluster of repositories. @@ -12,7 +12,7 @@ repositories. [[!toc ]] -## planned schedule +## plan Joey has received funding to work on this. 
Planned schedule of work: @@ -24,94 +24,27 @@ Planned schedule of work: * September: proving behavior of balanced preferred content with proxies * October: streaming through proxy to special remotes (especially S3) -[[!tag projects/openneuro]] - -## remaining things to do in October - -* Possibly some of the deferred items listed in following sections: - -## items deferred until later for balanced preferred content and maxsize tracking - -* The assistant is using NoLiveUpdate, but it should be posssible to plumb - a LiveUpdate through it from preferred content checking to location log - updating. - -* `git-annex info` in the limitedcalc path in cachedAllRepoData - double-counts redundant information from the journal due to using - overLocationLogs. In the other path it does not (any more; it used to), - and this should be fixed for consistency and correctness. - -* getLiveRepoSizes has a filterM getRecentChange over the live updates. - This could be optimised to a single sql join. There are usually not many - live updates, but sometimes there will be a great many recent changes, - so it might be worth doing this optimisation. Persistent is not capable - of this, would need dependency added on esquelito. - -## items deferred until later for p2p protocol over http +> This project is now complete! [[done]] --[[Joey]] -* Support proxying to git remotes that use annex+http urls. This needs a - translation from P2P protocol to servant-client to P2P protocol. - -* Should be possible to use a git-remote-annex annex::$uuid url as - remote.foo.url with remote.foo.annexUrl using annex+http, and so - not need a separate web server to serve the git repository. Doesn't work - currently because git-remote-annex urls only support special remotes. - It would need a new form of git-remote-annex url, eg: - annex::$uuid?annex+http://example.com/git-annex/ - -* `git-annex p2phttp` could support systemd socket activation. This would - allow making a systemd unit that listens on port 80. 
- -## items deferred until later for [[design/passthrough_proxy]] - -* Check annex.diskreserve when proxying for special remotes - to avoid the proxy's disk filling up with the temporary object file - cached there. - -* Resuming an interrupted download from proxied special remote makes the proxy - re-download the whole content. It could instead keep some of the - object files around when the client does not send SUCCESS. This would - use more disk, but could minimize to eg, the last 2 or so. - The design doc has some more thoughts about this. - -* Getting a key from a cluster currently picks from amoung - the lowest cost remotes at random. This could be smarter, - eg prefer to avoid using remotes that are doing other transfers at the - same time. +[[!tag projects/openneuro]] -* The cost of a proxied node that is accessed via an intermediate gateway - is currently the same as a node accessed via the cluster gateway. - To fix this, there needs to be some way to tell how many hops through - gateways it takes to get to a node. Currently the only way is to - guess based on number of dashes in the node name, which is not satisfying. +## some todos that spun off from this project and didn't get implemented during it: - Even counting hops is not very satisfying, one cluster gateway could - be much more expensive to traverse than another one. +For balanced preferred content and maxsize tracking: - If seriously tackling this, it might be worth making enough information - available to use spanning tree protocol for routing inside clusters. +* [[todo/assistant_does_not_use_LiveUpdate]] +* [[todo/git-annex_info_with_limit_overcounts]] -* Speed: A proxy to a local git repository spawns git-annex-shell - to communicate with it. It would be more efficient to operate - directly on the Remote. Especially when transferring content to/from it. 
- But: When a cluster has several nodes that are local git repositories, - and is sending data to all of them, this would need an alternate - interface than `storeKey`, which supports streaming, of chunks - of a ByteString. +For p2p protocol over http: -* Use `sendfile()` to avoid data copying overhead when - `receiveBytes` is being fed right into `sendBytes`. - Library to use: - <https://hackage.haskell.org/package/hsyscall-0.4/docs/System-Syscall.html> +* [[p2phttp_serve_multiple_repositories]] +* [[git-remote-annex_support_for_p2phttp]] -* Support using a proxy when its url is a P2P address. - (Eg tor-annex remotes.) +For proxying: -* When an upload to a cluster is distributed to multiple special remotes, - a temporary file is written for each one, which may even happen in - parallel. This is a lot of extra work and may use excess disk space. - It should be possible to only write a single temp file. - (With streaming this wouldn't be an issue.) +* [[proxying_for_p2phttp_and_tor-annex_remotes]] +* [[faster_proxying]] +* [[smarter_use_of_disk_when_proxying]] ## completed items for October's work on streaming through proxy to special remotes
link
diff --git a/doc/tips/client_side_upload_to_a_special_remote.mdwn b/doc/tips/client_side_upload_to_a_special_remote.mdwn index c31a4156fe..f619c07ad4 100644 --- a/doc/tips/client_side_upload_to_a_special_remote.mdwn +++ b/doc/tips/client_side_upload_to_a_special_remote.mdwn @@ -48,7 +48,7 @@ All that's left is to let git-annex know that the file has been uploaded to the S3 special remote. To accomplish this, the web browser will need to talk with git-annex on the server. The easy way to accomplish that is to run [[git-annex p2phttp|git-annex-p2phttp]]. -The web browser will be speaking the [[doc/design/P2P_protocol_over_HTTP]]. +The web browser will be speaking the [[design/P2P_protocol_over_HTTP]]. Make sure you have git-annex 10.20241031 or newer installed. That version extended the [[design/p2p_protocol]] with a `DATA-PRESENT` feature, which
fix link
diff --git a/doc/tips/client_side_upload_to_a_special_remote.mdwn b/doc/tips/client_side_upload_to_a_special_remote.mdwn index 25df9bf30e..c31a4156fe 100644 --- a/doc/tips/client_side_upload_to_a_special_remote.mdwn +++ b/doc/tips/client_side_upload_to_a_special_remote.mdwn @@ -24,7 +24,7 @@ bucket. Along with the S3 bucket, you will need a server set up, which is where git-annex will run in a git repository. Set up the S3 special remote there. And make git-annex on the server a -[proxy|git-annex-updateproxy]] for the S3 special remote: +[[proxy|git-annex-updateproxy]] for the S3 special remote: git-annex initremote s3 type=S3 exporttree=yes encryption=none bucket=mybucket git config remote.s3.annex-proxy true
DATA-PRESENT working for exporttree=yes remotes
Since the annex-tracking-branch is pushed first, git-annex has already
updated the export database when the DATA-PRESENT arrives. Which means
that just using checkPresent is enough to verify that there is some file
on the special remote in the export location for the key.
So, the simplest possible implementation of this happened to work!
(I also tested it with chunked special remotes, which also works, as long
as the chunk size used is the same as the configured chunk size. In that
case, the lack of a chunk log is not a problem. Doubtful this will ever
make sense to use with a chunked special remote though, that gets pretty
deep into re-implementing git-annex.)
Updated the client side upload tip with a missing step, and reorged for clarity.
Since the annex-tracking-branch is pushed first, git-annex has already
updated the export database when the DATA-PRESENT arrives. Which means
that just using checkPresent is enough to verify that there is some file
on the special remote in the export location for the key.
So, the simplest possible implementation of this happened to work!
(I also tested it with chunked specialremotes, which also works, as long
as the chunk size used is the same as the configured chunk size. In that
case, the lack of a chunk log is not a problem. Doubtful this will ever
make sense to use with a chunked special remote though, that gets pretty
deep into re-implementing git-annex.)
Updated the client side upload tip with a missing step, and reorged for clarity.
diff --git a/Annex/Proxy.hs b/Annex/Proxy.hs index 546c61dd0c..4f11f617c9 100644 --- a/Annex/Proxy.hs +++ b/Annex/Proxy.hs @@ -112,10 +112,7 @@ proxySpecialRemote protoversion r ihdl ohdl owaitv oclosedv mexportdb = go go :: Annex () go = liftIO receivemessage >>= \case Just (CHECKPRESENT k) -> do - tryNonAsync (Remote.checkPresent r k) >>= \case - Right True -> liftIO $ sendmessage SUCCESS - Right False -> liftIO $ sendmessage FAILURE - Left err -> liftIO $ propagateerror err + checkpresent k go Just (LOCKCONTENT _) -> do -- Special remotes do not support locking content. @@ -211,22 +208,14 @@ proxySpecialRemote protoversion r ihdl ohdl owaitv oclosedv mexportdb = go nuketmp giveup "protocol error" else store >> nuketmp - Just DATA_PRESENT -> tryNonAsync (verifydatapresent k) >>= \case - Right True -> liftIO $ sendmessage SUCCESS - Right False -> liftIO $ sendmessage FAILURE - Left err -> liftIO $ propagateerror err + Just DATA_PRESENT -> checkpresent k _ -> giveup "protocol error" - verifydatapresent k = case mexportdb of - Just exportdb -> liftIO (Export.getExportTree exportdb k) >>= \case - [] -> verifykey - -- XXX TODO check that one of the export locs is populated, - -- or for an annexobjects=yes special remote, the - -- annexobject file could be populated. 
- locs -> return True - Nothing -> verifykey - where - verifykey = Remote.checkPresent r k + checkpresent k = + tryNonAsync (Remote.checkPresent r k) >>= \case + Right True -> liftIO $ sendmessage SUCCESS + Right False -> liftIO $ sendmessage FAILURE + Left err -> liftIO $ propagateerror err storeput k af tmpfile = case mexportdb of Just exportdb -> liftIO (Export.getExportTree exportdb k) >>= \case diff --git a/doc/tips/client_side_upload_to_a_special_remote.mdwn b/doc/tips/client_side_upload_to_a_special_remote.mdwn index 0982850d5a..25df9bf30e 100644 --- a/doc/tips/client_side_upload_to_a_special_remote.mdwn +++ b/doc/tips/client_side_upload_to_a_special_remote.mdwn @@ -21,38 +21,45 @@ and then you can upload whatever filenames you want to it, rather than needing to use the same filenames git-annex uses for storing keys in a S3 bucket. +Along with the S3 bucket, you will need a server set up, which is where +git-annex will run in a git repository. Set up the S3 special remote there. +And make git-annex on the server a +[proxy|git-annex-updateproxy]] for the S3 special remote: + + git-annex initremote s3 type=S3 exporttree=yes encryption=none bucket=mybucket + git config remote.s3.annex-proxy true + git-annex updateproxy + +If the special remote is configured with exporttree=yes, be sure to also +configure the annex-tracking-branch for it on the server: + + git config remote.s3.annex-tracking-branch master + Once the browser uploads the file to S3, you need to add a git-annex symlink or pointer file to the git repository. This can be done in the browser, using [js-git](https://github.com/creationix/js-git). Generating a git-annex key is not hard, just hash the file content before/while uploading it, and see [[internals/key_format]]. Write that to a pointer file, or make a symlink to the appropriate directory under -.git/annex/objects (a bit harder). Commit it to git and push to your -server using js-git. - -Now git-annex knows about the file. 
But it doesn't yet know it's been -uploaded to the S3 special remote. To do this, you will need have your -server set up to run git-annex. Set up the S3 special -remote there. And make git-annex on the server a -[proxy|git-annex-updateproxy]] for the S3 special remote: - - git-annex initremote s3 type=S3 exporttree=yes encryption=none bucket=mybucket - git config remote.s3.annex-proxy true - git-annex updateproxy +.git/annex/objects (a bit harder). Commit it to git and push the branch +("master" in this example) to your server using js-git. -For the web browser to be able to easily talk with git-annex on the server, -you can run [[git-annex p2phttp|git-annex-p2phttp]]. +All that's left is to let git-annex know that the file has been uploaded to +the S3 special remote. To accomplish this, the web browser will need to +talk with git-annex on the server. The easy way to accomplish that +is to run [[git-annex p2phttp|git-annex-p2phttp]]. The web browser will be speaking the [[doc/design/P2P_protocol_over_HTTP]]. Make sure you have git-annex 10.20241031 or newer installed. That version extended the [[design/p2p_protocol]] with a `DATA-PRESENT` feature, which is just what you need. -All the web browser needs to do is `POST /git-annex/$uuid/v4/put` -with `data-present=true` included in the URL parameters, along with the -key of the file that was added to the git repository. -Replace `$uuid` with the UUID of the S3 special remote. -You can look that up with eg `git config remote.s3.annex-uuid`. +All the web browser needs to do, after uploading the S3 and pushing the git +branch to the server, is `POST /git-annex/$uuid/v4/put` with +`data-present=true` included in the URL parameters, along with the key of +the file that was added to the git repository. Replace `$uuid` with the +UUID of the S3 special remote. You can look that up with eg `git config +remote.s3.annex-uuid`. 
When the git-annex HTTP server receives that request, since it is configured to be able to proxy for the S3 special remote, it will act the diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 6fa103e933..bb38166c19 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -28,22 +28,6 @@ Planned schedule of work: ## remaining things to do in October -* Streaming uploads to special remotes via the proxy. Possibly; if a - workable design can be developed. It seems difficult without changing the - external special remote protocol, unless a fifo is used. Make ORDERED - response in p2p protocol allow using a fifo? - -* Indirect uploads when proxying for special remote is an alternative that - would work for OpenNeuro's use case. - -* If not implementing upload streaming to proxied special remotes, - this needs to be addressed: - When an upload to a cluster is distributed to multiple special remotes, - a temporary file is written for each one, which may even happen in - parallel. This is a lot of extra work and may use excess disk space. - It should be possible to only write a single temp file. - (With streaming this wouldn't be an issue.) - * Possibly some of the deferred items listed in following sections: ## items deferred until later for balanced preferred content and maxsize tracking @@ -123,6 +107,12 @@ Planned schedule of work: * Support using a proxy when its url is a P2P address. (Eg tor-annex remotes.) +* When an upload to a cluster is distributed to multiple special remotes, + a temporary file is written for each one, which may even happen in + parallel. This is a lot of extra work and may use excess disk space. + It should be possible to only write a single temp file. + (With streaming this wouldn't be an issue.) + ## completed items for October's work on streaming through proxy to special remotes * Stream downloads through proxy for all special remotes that indicate
Merge branch 'master' into p2pv4
break out todo item
diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index b355dd6755..1c5e70d04c 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -65,10 +65,6 @@ Planned schedule of work: ## items deferred until later for p2p protocol over http -* `git-annex p2phttp` should support serving several repositories at the same - time (not as proxied remotes), so that eg, every git-annex repository - on a server can be served on the same port. - * Support proxying to git remotes that use annex+http urls. This needs a translation from P2P protocol to servant-client to P2P protocol. diff --git a/doc/todo/p2phttp_serve_multiple_repositories.mdwn b/doc/todo/p2phttp_serve_multiple_repositories.mdwn new file mode 100644 index 0000000000..f2ad9c752e --- /dev/null +++ b/doc/todo/p2phttp_serve_multiple_repositories.mdwn @@ -0,0 +1,21 @@ +git-annex p2phttp should be able to serve multiple git repositories, eg all +the ones in a directory. + +* Add a --directory option. +* When a request is received for a uuid that is not one it already knows + about, it can rescan the directories to find new repositories that have + been added. +* Removing a repository that it is serving should be ok, git-annex doesn't + mkdir recursively down to the git repository level. So any running + threads that are serving the removed repository will not recreate it, + and will fail out. +* Removing a repository and then re-creating it with a different UUID + though would be problimatic and probably the documentation would need to + say to not do that, restart git-annex p2phttp after deletion in that + case. + +I asked matrss if this would be useful for forgejo-aneksajo and he said +"Serving more than one repository per p2phttp instance would probably be +very useful, although I think I can work with the limitation [of only 1]." + +[[!tag projects/INM7]]
add tip for DATA-PRESENT feature
diff --git a/CHANGELOG b/CHANGELOG index f3aa278e30..07c9df1aea 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -git-annex (10.20240928) UNRELEASED; urgency=medium +git-annex (10.20241031) UNRELEASED; urgency=medium * Sped up proxied downloads from special remotes, by streaming. * Added GETORDERED request to external special remote protocol. diff --git a/doc/tips/client_side_upload_to_a_special_remote.mdwn b/doc/tips/client_side_upload_to_a_special_remote.mdwn new file mode 100644 index 0000000000..0982850d5a --- /dev/null +++ b/doc/tips/client_side_upload_to_a_special_remote.mdwn @@ -0,0 +1,67 @@ +Suppose you are gathering files from users on the web and want to ingest +that data into a git-annex repository, with a special remote that is eg, a +S3 bucket. + +You could have the web browser upload to your server, and run git-annex +there, to add it to the git repository, and move it on to the S3 bucket. +That is inefficient though, the file goes into the server and back out, +and needs to be spooled to the server's disk as well. + +This page shows a more efficient way to do it, where the web browser +uploads directly to S3, and a git-annex repository is updated accordingly. +There is not (currently) a way to run git-annex in a web browser. +So you will need to write some custom code to do this. But with the +method described here, you won't need to re-implement all of git-annex in +the web browser. + +Uploading from the browser to S3 is left as an exercise to the reader. +All that matters really is, what filename to use in the S3 bucket? It's +simplest to make the S3 special remote an exporttree=yes special remote, +and then you can upload whatever filenames you want to it, rather than +needing to use the same filenames git-annex uses for storing keys in a S3 +bucket. + +Once the browser uploads the file to S3, you need to add a git-annex +symlink or pointer file to the git repository. 
This can be done in the +browser, using [js-git](https://github.com/creationix/js-git). Generating a +git-annex key is not hard, just hash the file content before/while +uploading it, and see [[internals/key_format]]. Write that to a pointer +file, or make a symlink to the appropriate directory under +.git/annex/objects (a bit harder). Commit it to git and push to your +server using js-git. + +Now git-annex knows about the file. But it doesn't yet know it's been +uploaded to the S3 special remote. To do this, you will need to have your +server set up to run git-annex. Set up the S3 special +remote there. And make git-annex on the server a +[[proxy|git-annex-updateproxy]] for the S3 special remote: + + git-annex initremote s3 type=S3 exporttree=yes encryption=none bucket=mybucket + git config remote.s3.annex-proxy true + git-annex updateproxy + +For the web browser to be able to easily talk with git-annex on the server, +you can run [[git-annex p2phttp|git-annex-p2phttp]]. +The web browser will be speaking the [[doc/design/P2P_protocol_over_HTTP]]. + +Make sure you have git-annex 10.20241031 or newer installed. That version +extended the [[design/p2p_protocol]] with a `DATA-PRESENT` feature, which +is just what you need. + +All the web browser needs to do is `POST /git-annex/$uuid/v4/put` +with `data-present=true` included in the URL parameters, along with the +key of the file that was added to the git repository. +Replace `$uuid` with the UUID of the S3 special remote. +You can look that up with eg `git config remote.s3.annex-uuid`. + +When the git-annex HTTP server receives that request, since it is +configured to be able to proxy for the S3 special remote, it will act the +same as if the content of the file had been sent in the request. But thanks +to `data-present=true`, it knows the data is already in the S3 special +remote. So it updates the git-annex branch to reflect that the file is +stored there. 
+ +Now if someone else clones the git repository, they can `git-annex get` the +file, and it will be downloaded from the S3 bucket, if that bucket is +configured to let them read it. Your server never needs to deal with the +content of the file. diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index b355dd6755..1ab94e7e01 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -127,12 +127,13 @@ Planned schedule of work: * Support using a proxy when its url is a P2P address. (Eg tor-annex remotes.) - ## completed items for October's work on streaming through proxy to special remotes * Stream downloads through proxy for all special remotes that indicate they download in order. * Added ORDERED message to external special remote protocol. +* Added DATA-PRESENT and documented in + [[tips/client_side_upload_to_a_special_remote]] ## completed items for September's work on proving behavior of preferred content
implement put data-present parameter in http servant
Changed the protocol docs because servant parses "true" and "false" for
booleans in query parameters, not "1" and "0".
clientPut with datapresent=True is not used by git-annex, and I don't
anticipate it being used in git-annex, except for testing.
I've tested this by making clientPut be called with datapresent=True and
git-annex copy to a remote succeeds once the object file is first
manually copied to the remote. That would be a good test for the test
suite, but running the http client means exposing it to at least
localhost, and would fail if a real http client was already running on
that port.
Changed the protocol docs because servant parses "true" and "false" for
booleans in query parameters, not "1" and "0".
clientPut with datapresent=True is not used by git-annex, and I don't
anticipate it being used in git-annex, except for testing.
I've tested this by making clientPut be called with datapresent=True and
git-annex copy to a remote succeeds once the object file is first
manually copied to the remote. That would be a good test for the test
suite, but running the http client means exposing it to at least
localhost, and would fail if a real http client was already running on
that port.
diff --git a/P2P/Http.hs b/P2P/Http.hs index a03b5a5f1a..37fe4c067c 100644 --- a/P2P/Http.hs +++ b/P2P/Http.hs @@ -44,7 +44,9 @@ type P2PHttpAPI :<|> "git-annex" :> SU :> PV3 :> "remove-before" :> RemoveBeforeAPI :<|> "git-annex" :> SU :> PV4 :> "gettimestamp" :> GetTimestampAPI :<|> "git-annex" :> SU :> PV3 :> "gettimestamp" :> GetTimestampAPI - :<|> "git-annex" :> SU :> PV4 :> "put" :> PutAPI PutResultPlus + :<|> "git-annex" :> SU :> PV4 :> "put" + :> QueryParam "data-present" Bool + :> PutAPI PutResultPlus :<|> "git-annex" :> SU :> PV3 :> "put" :> PutAPI PutResultPlus :<|> "git-annex" :> SU :> PV2 :> "put" :> PutAPI PutResultPlus :<|> "git-annex" :> SU :> PV1 :> "put" :> PutAPI PutResult diff --git a/P2P/Http/Client.hs b/P2P/Http/Client.hs index d9c2c71f6b..d047eca7a0 100644 --- a/P2P/Http/Client.hs +++ b/P2P/Http/Client.hs @@ -329,27 +329,32 @@ clientPut -> FileSize -> Annex Bool -- ^ Called after sending the file to check if it's valid. + -> Bool + -- ^ Set data-present parameter and do not actually send data + -- (v4+ only) -> ClientAction PutResultPlus #ifdef WITH_SERVANT -clientPut meterupdate k moffset af contentfile contentfilesize validitycheck clientenv (ProtocolVersion ver) su cu bypass auth = do - checkv <- liftIO newEmptyTMVarIO - checkresultv <- liftIO newEmptyTMVarIO - let checker = do - liftIO $ atomically $ takeTMVar checkv - validitycheck >>= liftIO . atomically . putTMVar checkresultv - checkerthread <- liftIO . 
async =<< forkState checker - v <- liftIO $ withBinaryFile contentfile ReadMode $ \h -> do - when (offset /= 0) $ - hSeek h AbsoluteSeek offset - withClientM (cli (stream h checkv checkresultv)) clientenv return - case v of - Left err -> do - void $ liftIO $ atomically $ tryPutTMVar checkv () - join $ liftIO (wait checkerthread) - return (Left err) - Right res -> do - join $ liftIO (wait checkerthread) - return (Right res) +clientPut meterupdate k moffset af contentfile contentfilesize validitycheck datapresent clientenv (ProtocolVersion ver) su cu bypass auth + | datapresent = liftIO $ withClientM (cli mempty) clientenv return + | otherwise = do + checkv <- liftIO newEmptyTMVarIO + checkresultv <- liftIO newEmptyTMVarIO + let checker = do + liftIO $ atomically $ takeTMVar checkv + validitycheck >>= liftIO . atomically . putTMVar checkresultv + checkerthread <- liftIO . async =<< forkState checker + v <- liftIO $ withBinaryFile contentfile ReadMode $ \h -> do + when (offset /= 0) $ + hSeek h AbsoluteSeek offset + withClientM (cli (stream h checkv checkresultv)) clientenv return + case v of + Left err -> do + void $ liftIO $ atomically $ tryPutTMVar checkv () + join $ liftIO (wait checkerthread) + return (Left err) + Right res -> do + join $ liftIO (wait checkerthread) + return (Right res) where stream h checkv checkresultv = S.SourceT $ \a -> do bl <- hGetContentsMetered h meterupdate @@ -401,7 +406,7 @@ clientPut meterupdate k moffset af contentfile contentfilesize validitycheck cli bk = B64Key k cli src = case ver of - 4 -> v4 su V4 len bk cu bypass baf moffset src auth + 4 -> v4 su V4 (if datapresent then Just True else Nothing) len bk cu bypass baf moffset src auth 3 -> v3 su V3 len bk cu bypass baf moffset src auth 2 -> v2 su V2 len bk cu bypass baf moffset src auth 1 -> plus <$> v1 su V1 len bk cu bypass baf moffset src auth diff --git a/P2P/Http/Server.hs b/P2P/Http/Server.hs index 5ec5438a50..eaf87f36b4 100644 --- a/P2P/Http/Server.hs +++ 
b/P2P/Http/Server.hs @@ -26,6 +26,7 @@ import P2P.Http import P2P.Http.Types import P2P.Http.State import P2P.Protocol hiding (Offset, Bypass, auth) +import qualified P2P.Protocol import P2P.IO import P2P.Annex import Annex.WorkerPool @@ -69,10 +70,10 @@ serveP2pHttp st :<|> serveGetTimestamp st :<|> serveGetTimestamp st :<|> servePut st id - :<|> servePut st id - :<|> servePut st id - :<|> servePut st dePlus - :<|> servePut st dePlus + :<|> servePut' st id + :<|> servePut' st id + :<|> servePut' st dePlus + :<|> servePut' st dePlus :<|> servePutOffset st id :<|> servePutOffset st id :<|> servePutOffset st id @@ -307,6 +308,7 @@ servePut -> (PutResultPlus -> t) -> B64UUID ServerSide -> v + -> Maybe Bool -> DataLength -> B64Key -> B64UUID ClientSide @@ -317,7 +319,15 @@ servePut -> IsSecure -> Maybe Auth -> Handler t -servePut st resultmangle su apiver (DataLength len) (B64Key k) cu bypass baf moffset stream sec auth = do +servePut st resultmangle su apiver (Just True) _ k cu bypass baf _ _ sec auth = do + res <- withP2PConnection' apiver st cu su bypass sec auth WriteAction + (\cst -> cst { connectionWaitVar = False }) (liftIO . 
protoaction) + servePutResult resultmangle res + where + protoaction conn = servePutAction st conn k baf $ \_offset -> do + net $ sendMessage DATA_PRESENT + checkSuccessPlus +servePut st resultmangle su apiver _datapresent (DataLength len) k cu bypass baf moffset stream sec auth = do validityv <- liftIO newEmptyTMVarIO let validitycheck = local $ runValidityCheck $ liftIO $ atomically $ readTMVar validityv @@ -327,41 +337,27 @@ servePut st resultmangle su apiver (DataLength len) (B64Key k) cu bypass baf mof (\cst -> cst { connectionWaitVar = False }) $ \conn -> do liftIO $ void $ async $ checktooshort conn tooshortv liftIO (protoaction conn content validitycheck) - case res of - Right (Right (Just plusuuids)) -> return $ resultmangle $ - PutResultPlus True (map B64UUID plusuuids) - Right (Right Nothing) -> return $ resultmangle $ - PutResultPlus False [] - Right (Left protofail) -> throwError $ - err500 { errBody = encodeBL (describeProtoFailure protofail) } - Left err -> throwError $ - err500 { errBody = encodeBL (show err) } + servePutResult resultmangle res where - protoaction conn content validitycheck = inAnnexWorker st $ - enteringStage (TransferStage Download) $ - runFullProto (clientRunState conn) (clientP2PConnection conn) $ - protoaction' content validitycheck - - protoaction' content validitycheck = put' k af $ \offset' -> - let offsetdelta = offset' - offset - in case compare offset' offset of - EQ -> sendContent' nullMeterUpdate (Len len) - content validitycheck - GT -> sendContent' nullMeterUpdate - (Len (len - fromIntegral offsetdelta)) - (L.drop (fromIntegral offsetdelta) content) - validitycheck - LT -> sendContent' nullMeterUpdate - (Len len) - content - (validitycheck >>= \_ -> return Invalid) + protoaction conn content validitycheck = + servePutAction st conn k baf $ \offset' -> + let offsetdelta = offset' - offset + in case compare offset' offset of + EQ -> sendContent' nullMeterUpdate (Len len) + content validitycheck + GT -> sendContent' 
nullMeterUpdate + (Len (len - fromIntegral offsetdelta)) + (L.drop (fromIntegral offsetdelta) content) + validitycheck + LT -> sendContent' nullMeterUpdate + (Len len) + content + (validitycheck >>= \_ -> return Invalid) offset = case moffset of Just (Offset o) -> o Nothing -> 0 - af = b64FilePathToAssociatedFile baf - -- Streams the ByteString from the client. Avoids returning a longer -- than expected ByteString by truncating to the expected length. -- Returns a shorter than expected ByteString when the data is not @@ -399,6 +395,49 @@ servePut st resultmangle su apiver (DataLength len) (B64Key k) cu bypass baf mof liftIO $ whenM (atomically $ takeTMVar tooshortv) $ closeP2PConnection conn +servePutAction + :: P2PHttpServerState + -> P2PConnectionPair + -> B64Key + -> Maybe B64FilePath + -> (P2P.Protocol.Offset -> Proto (Maybe [UUID])) + -> IO (Either SomeException (Either ProtoFailure (Maybe [UUID]))) (Diff truncated)
diff --git a/doc/forum/Assistant___47___watch_doesn__39__t_create_symlinks.mdwn b/doc/forum/Assistant___47___watch_doesn__39__t_create_symlinks.mdwn new file mode 100644 index 0000000000..1828b91261 --- /dev/null +++ b/doc/forum/Assistant___47___watch_doesn__39__t_create_symlinks.mdwn @@ -0,0 +1,8 @@ +When I add a file manually with `git annex add` it works like expected: the file is replaced by a symlink pointing to a file in .git/objects. +But with assistant or watch, I end up with a copy in my working tree. + + +Is this by design? + +I use version 10.20240831-gb3dc65615318ffed194dfb85e631482d4515a180 +I tried it on a linux box with btrfs and on WSL2 with ext4.
p2p protocol version 4 for DATA-PRESENT
diff --git a/doc/design/p2p_protocol.mdwn b/doc/design/p2p_protocol.mdwn index 5b15142a52..d9cf75a02f 100644 --- a/doc/design/p2p_protocol.mdwn +++ b/doc/design/p2p_protocol.mdwn @@ -56,7 +56,7 @@ any authentication. The client sends the highest protocol version it supports: - VERSION 3 + VERSION 4 The server responds with the highest protocol version it supports that is less than or equal to the version the client sent: @@ -190,7 +190,8 @@ ALREADY-HAVE-PLUS. The subsequent list of UUIDs are additional UUIDs where the content is stored, in addition to the UUID where the client was going to send it. -Otherwise, it responds with: +When the server wants the client to send the content to it, +it replies with: PUT-FROM Offset @@ -200,7 +201,7 @@ the client to start. This allows resuming transfers. The client then sends a DATA message with content of the file from the offset to the end of file. -In protocol version 1 and above, after the data, the client sends an +In protocol version 1 and above, after the DATA, the client sends an additional message, to indicate if the content of the file has changed while it was being sent. @@ -213,6 +214,18 @@ it replies with SUCCESS. Otherwise, FAILURE. In protocol version 2 and above, the server can optionally reply with SUCCESS-PLUS and a list of UUIDs where the content was stored. +In protocol version 4 and above, rather than sending a DATA message, +the client can send the content to the repository in some other way +than using the protocol. To indicate that it has sent the content +like that, the client sends: + + DATA-PRESENT + +The server responds to DATA-PRESENT by checking if the content is +present in the repository. If so it proceeds the same as if the client +had used DATA to send the data and then sent VALID. Eg, it responds with +SUCCESS (or FAILURE if it cannot verify that the data is present). 
+ ## Getting content from the server To get content from the server, the client sends: diff --git a/doc/design/p2p_protocol_over_http.mdwn b/doc/design/p2p_protocol_over_http.mdwn index 1c5ff28a61..0e49782fa0 100644 --- a/doc/design/p2p_protocol_over_http.mdwn +++ b/doc/design/p2p_protocol_over_http.mdwn @@ -49,7 +49,7 @@ over HTTPS. Requests are versioned. The versions correspond to P2P protocol versions. The version is part of the request path, -eg "v3" +eg "v4" If the server does not support a particular version, the request will fail with a 404, and the client should fall @@ -99,7 +99,7 @@ with 404 Not Found. Note that the common parameters bypass and clientuuid, while accepted, have no effect. Both are optional for this request. -### GET /git-annex/$uuid/v3/key/$key +### GET /git-annex/$uuid/v4/key/$key Get the content of a key from the repository with the specified uuid. @@ -144,20 +144,24 @@ an unlocked annexed file, which got modified while it was being sent. When the key is not present on the server, it will respond with 404 Not Found. +### GET /git-annex/$uuid/v3/key/$key + +Identical to v4. + ### GET /git-annex/$uuid/v2/key/$key -Identical to v3. +Identical to v4. ### GET /git-annex/$uuid/v1/key/$key -Identical to v3. +Identical to v4. ### GET /git-annex/$uuid/v0/key/$key -Same as v3, except the X-git-annex-data-length header is not used. +Same as v4, except the X-git-annex-data-length header is not used. Additional checking client-side will be required to validate the data. -### POST /git-annex/$uuid/v3/checkpresent +### POST /git-annex/$uuid/v4/checkpresent Checks if a key is currently present on the server. @@ -173,19 +177,23 @@ The body of the request is empty. The server responds with a JSON object with a "present" field that is true if the key is present, or false if it is not present. +### POST /git-annex/$uuid/v3/checkpresent + +Identical to v4. + ### POST /git-annex/$uuid/v2/checkpresent -Identical to v3. +Identical to v4. 
### POST /git-annex/$uuid/v1/checkpresent -Identical to v3. +Identical to v4. ### POST /git-annex/$uuid/v0/checkpresent -Identical to v3. +Identical to v4. -### POST /git-annex/$uuid/v3/lockcontent +### POST /git-annex/$uuid/v4/lockcontent Locks the content of a key on the server, preventing it from being removed. @@ -203,19 +211,23 @@ The key will remain locked for 10 minutes. But, usually `keeplocked` is used to control the lifetime of the lock, using the "lockid" parameter from the server's reply. (See below.) +### POST /git-annex/$uuid/v3/lockcontent + +Identical to v4. + ### POST /git-annex/$uuid/v2/lockcontent -Identical to v3. +Identical to v4. ### POST /git-annex/$uuid/v1/lockcontent -Identical to v3. +Identical to v4. ### POST /git-annex/$uuid/v0/lockcontent -Identical to v3. +Identical to v4. -### POST /git-annex/$uuid/v3/keeplocked +### POST /git-annex/$uuid/v4/keeplocked Controls the lifetime of a lock on a key that was earlier obtained with `lockcontent`. @@ -252,21 +264,25 @@ locked for 10 minutes from the time it was first locked. Note that the common parameters bypass and clientuuid, while accepted, have no effect. +### POST /git-annex/$uuid/v3/keeplocked + +Identical to v4. + ### POST /git-annex/$uuid/v2/keeplocked -Identical to v3. +Identical to v4. ### POST /git-annex/$uuid/v1/keeplocked -Identical to v3. +Identical to v4. ### POST /git-annex/$uuid/v0/keeplocked -Identical to v3. +Identical to v4. -### POST /git-annex/$uuid/v3/remove +### POST /git-annex/$uuid/v4/remove -Remove a key's content from the server. +Remove a key's content from the repository. Example: @@ -284,21 +300,25 @@ or false if the key was not able to be removed. The JSON object can have an additional field "plusuuids" that is a list of UUIDs of other repositories that the content was removed from. +### POST /git-annex/$uuid/v3/remove + +Identical to v4. + ### POST /git-annex/$uuid/v2/remove -Identical to v3. (Diff truncated)
formatting
diff --git a/doc/design/passthrough_proxy.mdwn b/doc/design/passthrough_proxy.mdwn index ce90aa695c..7c57508136 100644 --- a/doc/design/passthrough_proxy.mdwn +++ b/doc/design/passthrough_proxy.mdwn @@ -778,12 +778,12 @@ reupload would overwrite the bad content. > > The P2P protocol could be extended by letting ALREADY-STORED be > sent by the client instead of DATA: -> -> PUT associatedfile key -> PUT-FROM 0 -> ALREADY-STORED -> SUCCESS -> + + PUT associatedfile key + PUT-FROM 0 + ALREADY-STORED + SUCCESS + > That lets the server send ALREADY-HAVE instead of PUT-FROM, preventing > the client from uploading content that is already present. And it can > send SUCCESS-PLUS at the end as well, or FAILURE if the checkpresent
update
diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 02b62952d7..b355dd6755 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -21,12 +21,12 @@ Planned schedule of work: * July: p2p protocol over http * August, part 1: git-annex proxy support for exporttree * August, part 2: balanced preferred content -* October: proving behavior of balanced preferred content with proxies -* September: streaming through proxy to special remotes (especially S3) +* September: proving behavior of balanced preferred content with proxies +* October: streaming through proxy to special remotes (especially S3) [[!tag projects/openneuro]] -## remaining things to do +## remaining things to do in October * Streaming uploads to special remotes via the proxy. Possibly; if a workable design can be developed. It seems difficult without changing the @@ -44,22 +44,7 @@ Planned schedule of work: It should be possible to only write a single temp file. (With streaming this wouldn't be an issue.) -## completed items for October's work on streaming through proxy to special remotes - -* Stream downloads through proxy for all special remotes that indicate - they download in order. -* Added ORDERED message to external special remote protocol. - -## completed items for September's work on proving behavior of preferred content - -* Static analysis to detect "not present", "not balanced", and similar - unstable preferred content expressions and avoid problems with them. -* Implemented `git-annex sim` command. -* Simulated a variety of repository networks, and random preferred content - expressions, checking that a stable state is always reached. -* Fix bug that prevented anything being stored in an empty - repository whose preferred content expression uses sizebalanced. 
- (Identified via `git-annex sim`) +* Possibly some of the deferred items listed in following sections: ## items deferred until later for balanced preferred content and maxsize tracking @@ -78,38 +63,6 @@ Planned schedule of work: so it might be worth doing this optimisation. Persistent is not capable of this, would need dependency added on esquelito. -## completed items for August's work on balanced preferred content - -* Balanced preferred content basic implementation, including --rebalance - option. -* Implemented [[track_free_space_in_repos_via_git-annex_branch]] -* Implemented tracking of live changes to repository sizes. -* `git-annex maxsize` -* annex.fullybalancedthreshhold - -## completed items for August's work on git-annex proxy support for exporttre - -* Special remotes configured with exporttree=yes annexobjects=yes - can store objects in .git/annex/objects, as well as an exported tree. - -* Support proxying to special remotes configured with - exporttree=yes annexobjects=yes. - -* post-retrieve: When proxying is enabled for an exporttree=yes - special remote and the configured remote.name.annex-tracking-branch - is received, the tree is exported to the special remote. - -* When getting from a P2P HTTP remote, prompt for credentials when - required, instead of failing. - -* Prevent `updateproxy` and `updatecluster` from adding - an exporttree=yes special remote that does not have - annexobjects=yes, to avoid foot shooting. - -* Implement `git-annex export treeish --to=foo --from=bar`, which - gets from bar as needed to send to foo. Make post-retrieve use - `--to=r --from=r` to handle the multiple files case. - ## items deferred until later for p2p protocol over http * `git-annex p2phttp` should support serving several repositories at the same @@ -129,24 +82,6 @@ Planned schedule of work: * `git-annex p2phttp` could support systemd socket activation. This would allow making a systemd unit that listens on port 80. 
-## completed items for July's work on p2p protocol over http - -* HTTP P2P protocol design [[design/p2p_protocol_over_http]]. - -* addressed [[doc/todo/P2P_locking_connection_drop_safety]] - -* implemented server and client for HTTP P2P protocol - -* added git-annex p2phttp command to serve HTTP P2P protocol - -* Make git-annex p2phttp support https. - -* Allow using annex+http urls in remote.name.annexUrl - -* Make http server support proxying. - -* Make http server support serving a cluster. - ## items deferred until later for [[design/passthrough_proxy]] * Check annex.diskreserve when proxying for special remotes @@ -192,6 +127,74 @@ Planned schedule of work: * Support using a proxy when its url is a P2P address. (Eg tor-annex remotes.) + +## completed items for October's work on streaming through proxy to special remotes + +* Stream downloads through proxy for all special remotes that indicate + they download in order. +* Added ORDERED message to external special remote protocol. + +## completed items for September's work on proving behavior of preferred content + +* Static analysis to detect "not present", "not balanced", and similar + unstable preferred content expressions and avoid problems with them. +* Implemented `git-annex sim` command. +* Simulated a variety of repository networks, and random preferred content + expressions, checking that a stable state is always reached. +* Fix bug that prevented anything being stored in an empty + repository whose preferred content expression uses sizebalanced. + (Identified via `git-annex sim`) + +## completed items for August's work on balanced preferred content + +* Balanced preferred content basic implementation, including --rebalance + option. +* Implemented [[track_free_space_in_repos_via_git-annex_branch]] +* Implemented tracking of live changes to repository sizes. 
+* `git-annex maxsize` +* annex.fullybalancedthreshhold + +## completed items for August's work on git-annex proxy support for exporttre + +* Special remotes configured with exporttree=yes annexobjects=yes + can store objects in .git/annex/objects, as well as an exported tree. + +* Support proxying to special remotes configured with + exporttree=yes annexobjects=yes. + +* post-retrieve: When proxying is enabled for an exporttree=yes + special remote and the configured remote.name.annex-tracking-branch + is received, the tree is exported to the special remote. + +* When getting from a P2P HTTP remote, prompt for credentials when + required, instead of failing. + +* Prevent `updateproxy` and `updatecluster` from adding + an exporttree=yes special remote that does not have + annexobjects=yes, to avoid foot shooting. + +* Implement `git-annex export treeish --to=foo --from=bar`, which + gets from bar as needed to send to foo. Make post-retrieve use + `--to=r --from=r` to handle the multiple files case. + +## completed items for July's work on p2p protocol over http + +* HTTP P2P protocol design [[design/p2p_protocol_over_http]]. + +* addressed [[doc/todo/P2P_locking_connection_drop_safety]] + +* implemented server and client for HTTP P2P protocol + +* added git-annex p2phttp command to serve HTTP P2P protocol + +* Make git-annex p2phttp support https. + +* Allow using annex+http urls in remote.name.annexUrl + +* Make http server support proxying. + +* Make http server support serving a cluster. + ## completed items for June's work on [[design/passthrough_proxy]]: * UUID discovery via git-annex branch. Add a log file listing UUIDs
simplified design for indirect uploads
diff --git a/doc/design/passthrough_proxy.mdwn b/doc/design/passthrough_proxy.mdwn index f73ec9a45b..ce90aa695c 100644 --- a/doc/design/passthrough_proxy.mdwn +++ b/doc/design/passthrough_proxy.mdwn @@ -685,6 +685,9 @@ When a client wants to upload an object, the proxy could indicate that the upload should not be sent to it, but instead be PUT to a HTTP url that it provides to the client. +(This would presumably only be used with unencrypted and unchunked special +remotes.) + An example use case involves [presigned S3 urls](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html). When the proxy is to a S3 bucket, having the client upload @@ -739,3 +742,49 @@ in cases like OpenNeuro's, where something other than git-annex is communicating with the git-annex proxy. Alternatively, allow the wrong content to be sent to the url, but then indicate that with INVALID. A later reupload would overwrite the bad content. + +> Something like this can already be accomplished another way: Don't change +> the protocol at all, have the client generate the presigned url itself +> (or request it from something other than git-annex), upload the object, +> and update the git-annex branch to indicate it's stored in the S3 +> special remote. +> +> That needs the client to be aware of what filename to use in the S3 +> bucket (either a git-annex key or the exported filename depending on the +> special remote configuration). And it has to know how to update the +> git-annex location log. And for exporttree remotes, the export log. +> So effectively, the client needs to either be git-annex or implement a +> decent amount of its internals, or git-annex would need some additional +> plumbing commands for the client to use. (If the client is javascript +> running in the browser, it would be difficult for it to run git-annex +> though.) +> +> Perhaps there is something in the middle between these two extremes. 
+> Extend the P2P protocol to let the client indicate it has +> uploaded a key to the remote. The proxy then updates the git-annex branch +> to reflect that the upload happened. (Maybe it uses checkpresent to +> verify it first.) +> +> This leaves it up to the client to understand what filename to +> use to store a key in the S3 bucket (or wherever). For an exporttree=yes +> remote, it's simply the file being added, and for other remotes, +> `git-annex examinekey` can be used. Perhaps the protocol could indicate +> the filename for the client to use. But generally what filename or whatever +> to use for a key in a special remote is something only the special +> remote's implementation knows about, there is not an interface to get it. +> In practice, there are a few common patterns and anyway this would only +> be used with some particular special remote, like S3, that the client +> understands how to write to. +> +> The P2P protocol could be extended by letting ALREADY-STORED be +> sent by the client instead of DATA: +> +> PUT associatedfile key +> PUT-FROM 0 +> ALREADY-STORED +> SUCCESS +> +> That lets the server send ALREADY-HAVE instead of PUT-FROM, preventing +> the client from uploading content that is already present. And it can +> send SUCCESS-PLUS at the end as well, or FAILURE if the checkpresent +> verification fails.
fix reversion in getting from unchunked encrypted special remotes
Have to use the object file for the encrypted key, not the unencrypted
key.
Bug introduced in 835283b86240ef4b68c44f4332ac1b644e08e49f
Have to use the object file for the encrypted key, not the unencrypted
key.
Bug introduced in 835283b86240ef4b68c44f4332ac1b644e08e49f
diff --git a/Remote/Helper/Chunked.hs b/Remote/Helper/Chunked.hs index 7ebdfdccef..9b40d5b10c 100644 --- a/Remote/Helper/Chunked.hs +++ b/Remote/Helper/Chunked.hs @@ -328,7 +328,9 @@ retrieveChunks retriever u vc chunkconfig encryptor basek dest basep enc encc iv <- startVerifyKeyContentIncrementally vc basek case enc of Just _ -> do - retriever (encryptor basek) basep (toRawFilePath dest) Nothing $ + let enck = encryptor basek + objloc <- fromRepo $ gitAnnexTmpObjectLocation enck + retriever (encryptor basek) basep objloc Nothing $ retrieved iv Nothing basep return (Right iv) -- Not chunked and not encrypted, so ask the diff --git a/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg.mdwn b/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg.mdwn index f35837148b..44d4050978 100644 --- a/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg.mdwn +++ b/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg.mdwn @@ -32,3 +32,5 @@ failed I tried to rerun build within that env -- the same + +> [[fixed|done]] --[[Joey]] diff --git a/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment b/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment index fe941419cc..27b4263e8d 100644 --- a/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment +++ b/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment @@ -10,4 +10,6 @@ The file is mode 444, which is a permission git-annex sets. Then git-annex opens it for write. Oops.. Bisected to [[!commit 835283b86240ef4b68c44f4332ac1b644e08e49f]]. + +Fixed. """]]
bisected
diff --git a/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment b/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment index a97a83da0f..fe941419cc 100644 --- a/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment +++ b/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment @@ -8,4 +8,6 @@ special remotes, not just the test suite. The file is mode 444, which is a permission git-annex sets. Then git-annex opens it for write. Oops.. + +Bisected to [[!commit 835283b86240ef4b68c44f4332ac1b644e08e49f]]. """]]
reproduced
diff --git a/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment b/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment new file mode 100644 index 0000000000..a97a83da0f --- /dev/null +++ b/doc/bugs/v10_locked_gpg_crypto_test_fails_while_debian_pkg/comment_1_a59ada9b508ec7828bbf3a9c8fbd2c2c._comment @@ -0,0 +1,11 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2024-10-28T15:29:56Z" + content=""" +Reproducible here. Also this does break getting files from encrypted +special remotes, not just the test suite. + +The file is mode 444, which is a permission git-annex sets. Then git-annex +opens it for write. Oops.. +"""]]